gaoqiong / composable_kernel · Commits

Commit 10962ae6
authored Jun 06, 2022 by rocking

Merge commit '1677cf70' into gemm_add_bias_reduction

Parents: 300ac4e4, 1677cf70

Changes: 28 in total; this page shows 8 changed files with 307 additions and 480 deletions (+307 −480).
Files changed on this page:

include/ck/utility/reduction_operator.hpp                     +7    −10
library/include/ck/library/host_tensor/host_reduce_util.hpp   +0    −257
library/include/ck/library/host_tensor/host_reduction.hpp     +33   −38
profiler/include/profile_batched_gemm_reduce_impl.hpp         +2    −2
profiler/include/profile_gemm_reduce_impl.hpp                 +2    −2
profiler/include/profile_reduce_impl.hpp                      +12   −10
script/parse_perf_data.py                                     +199  −109
script/profile_conv.sh                                        +52   −52
include/ck/utility/reduction_operator.hpp

@@ -36,7 +36,7 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least three members:
-// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary
 //    operator, "identity element" is the unique element in the algebraic space that doesn't affect
 //    the value of other elements when operated against them, and the concept is similar to zero vector in
@@ -59,7 +59,7 @@ struct Add
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -76,7 +76,7 @@ struct Mul
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -92,7 +92,7 @@ struct Max
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Lowest();
     };
@@ -125,10 +125,7 @@ struct Min
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
-    {
-        return NumericLimits<T>::Max();
-    };
+    __host__ __device__ static constexpr T GetIdentityValue() { return NumericLimits<T>::Max(); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -158,7 +155,7 @@ struct AMax
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -184,7 +181,7 @@ struct AMax
 };

 template <typename T>
-T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+T GetIdentityValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
 {
     T result = ck::type_convert<T>(0.0f);
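The rename does not change the contract spelled out in the comment above: seeding a fold with GetIdentityValue() must leave the result unaffected. A minimal self-contained sketch of that contract, with a simplified Add functor standing in for the library's:

    #include <cassert>

    template <typename T>
    struct Add
    {
        // identity element of "+": folding it in changes nothing
        static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); }
        void operator()(T& a, T b) const { a = a + b; }
    };

    template <typename Op, typename T, int N>
    T fold(const T (&data)[N])
    {
        T acc = Op::GetIdentityValue(); // seed with the identity element
        Op op{};
        for(const T& x : data)
            op(acc, x);
        return acc;
    }

    int main()
    {
        float v[3] = {1.0f, 2.0f, 3.0f};
        assert(fold<Add<float>>(v) == 6.0f);

        // an input holding only the identity folds to the identity itself
        float only_identity[1] = {Add<float>::GetIdentityValue()};
        assert(fold<Add<float>>(only_identity) == 0.0f);
        return 0;
    }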
library/include/ck/library/host_tensor/host_reduce_util.hpp (deleted, 100644 → 0)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <limits>
#include <cmath>
#include <functional>
#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"
namespace ck {
namespace host_reduce {

using ck::NanPropagation;
using ck::ReduceTensorOp;

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using ck::math::abs;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        return ([&](AccDataType&) {});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
    {
        return ([&, divider](AccDataType& a_) {
            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        // ReduceTensorOp::AMAX:
        return ([&](AccDataType&) {});
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::NORM2:
        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return (ck::NumericLimits<AccDataType>::Max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
        return (ck::NumericLimits<AccDataType>::Lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
        // ReduceTensorOp::ADD
        // ReduceTensorOp::AVG
        // ReduceTensorOp::NORM1
        // ReduceTensorOp::NORM2
        return (static_cast<AccDataType>(0.0f));
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                     AccDataType& accuVal,
                     AccDataType currVal)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        opReduce(accuVal, currVal);
    }
    else
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce(accuVal, currVal);
    };
};

template <typename AccDataType, typename IndexDataType, bool PropagateNan>
__host__ static inline void
binop_with_index_and_nan_check(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
                               AccDataType& accuVal,
                               AccDataType currVal,
                               IndexDataType& accuIndex,
                               IndexDataType currIndex)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        bool changed;

        opReduce(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    }
    else
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed;

            opReduce(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        };
    };
};

}; // namespace host_reduce
}; // namespace ck

#endif
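This deleted header selected the reduction behavior at runtime through std::function objects; the code that replaces it (see host_reduction.hpp below) resolves the same choices at compile time through functor template parameters. A minimal sketch of that pattern, assuming a functor-style ReduceOperation (AccumulateWithNanCheckSketch and MaxOp are illustrative names, not ck::detail's actual definitions):

    #include <cassert>
    #include <cmath>

    // Statically selects both the reduction op and the NaN policy, where the
    // deleted helpers dispatched through std::function objects at runtime.
    template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
    struct AccumulateWithNanCheckSketch
    {
        static void Calculate(AccDataType& accuVal, AccDataType currVal)
        {
            if constexpr(PropagateNan)
            {
                if(std::isnan(currVal))
                {
                    accuVal = currVal; // NaN wins, mirroring binop_with_nan_check above
                    return;
                }
            }
            ReduceOperation{}(accuVal, currVal); // statically chosen, inlinable
        }
    };

    struct MaxOp
    {
        void operator()(float& a, float b) const
        {
            if(a < b)
                a = b;
        }
    };

    int main()
    {
        const float xs[3] = {1.0f, 5.0f, 3.0f};
        float acc         = -1.0e30f;
        for(float x : xs)
            AccumulateWithNanCheckSketch<true, MaxOp, float>::Calculate(acc, x);
        assert(acc == 5.0f);
        return 0;
    }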
library/include/ck/library/host_tensor/host_reduction.hpp

@@ -33,10 +33,10 @@
 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
-#include "host_reduce_util.hpp"
 #include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
+#include "reduction_functions_accumulate.hpp"

 template <int NDim>
 static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
@@ -106,11 +106,13 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          ck::ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation,
           int Rank,
           int NumReduceDim,
           bool PropagateNan,
-          bool NeedIndices>
+          bool OutputIndex>
 struct ReductionHost
 {
     using IndexDataType = int32_t;
@@ -122,8 +124,6 @@ struct ReductionHost
     std::vector<int> reduceDims;
     IndexDataType divider;
-    std::function<void(AccDataType&)> preUnaryOp;
-    std::function<void(AccDataType&)> posUnaryOp;
     std::array<size_t, NumReduceDim> reduceLengths;
     std::array<size_t, NumReduceDim> reduceStrides;
     std::array<size_t, NumInvariantDim> invariantLengths;
@@ -137,9 +137,6 @@ struct ReductionHost
                   const std::vector<int>& invariantDims_,
                   const std::vector<int>& reduceDims_)
     {
-        using ck::host_reduce::PosUnaryOpFn;
-        using ck::host_reduce::PreUnaryOpFn;
-
         // this->outLengths = to_int_vector(outDesc.GetLengths());
         this->outStrides = outDesc.GetStrides();
@@ -171,9 +168,6 @@ struct ReductionHost
             invariant_dim_indexes.clear();

             get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
         };
-
-        preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
-        posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
     };

     void Run(float alpha,
@@ -182,7 +176,7 @@ struct ReductionHost
              OutDataType* out_data,
              IndexDataType* out_indices)
     {
-        if constexpr(NeedIndices)
+        if constexpr(OutputIndex)
         {
             RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
         }
@@ -201,15 +195,17 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_index_and_nan_check;
-        using ck::host_reduce::ReduceOpFn2;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
+        using Accumulation = ck::detail::
+            AccumulateWithIndexAndNanCheck<PropagateNan, ReduceOperation, AccDataType, IndexDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal     = ReduceOperation::GetIdentityValue();
             IndexDataType accuIndex = 0;

             for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
@@ -219,15 +215,14 @@ struct ReductionHost
                 auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

                 auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
             };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);
@@ -241,7 +236,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal     = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal     = ReduceOperation::GetIdentityValue();
                 IndexDataType accuIndex = 0;

                 auto offset_invariant =
@@ -255,15 +250,14 @@ struct ReductionHost
                     auto currVal =
                         type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

                     auto currIndex = static_cast<IndexDataType>(i);

-                    binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                        opReduce2, accuVal, currVal, accuIndex, currIndex);
+                    Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
                 };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);
@@ -308,15 +302,16 @@ struct ReductionHost
         using ck::float_equal_one;
         using ck::float_equal_zero;
         using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::ReduceOpFn;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

         if constexpr(NumInvariantDim == 0)
         {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();

             for(const auto& reduce_index : reduce_dim_indexes)
             {
@@ -325,12 +320,12 @@ struct ReductionHost
                 auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
             };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

             if(!float_equal_one{}(alpha))
                 accuVal *= type_convert<AccDataType>(alpha);
@@ -343,7 +338,7 @@ struct ReductionHost
         else
         {
             auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();

                 auto offset_invariant =
                     get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
@@ -356,12 +351,12 @@ struct ReductionHost
                     auto currVal =
                         type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                    preUnaryOp(currVal);
+                    in_elementwise_op(currVal, currVal);

-                    binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                    Accumulation::Calculate(accuVal, currVal);
                 };

-                posUnaryOp(accuVal);
+                acc_elementwise_op(accuVal, accuVal);

                 if(!float_equal_one{}(alpha))
                     accuVal *= type_convert<AccDataType>(alpha);
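RunImpl_with_index relies on the accumulation reporting whether the running value changed, so the caller can latch the index of the winning element. A self-contained sketch of that protocol (AccumulateWithIndexSketch is illustrative, not the actual ck::detail::AccumulateWithIndexAndNanCheck, which additionally handles the NaN policy):

    #include <cassert>

    template <typename ReduceOperation, typename AccDataType, typename IndexDataType>
    struct AccumulateWithIndexSketch
    {
        static void Calculate(AccDataType& accuVal,
                              AccDataType currVal,
                              IndexDataType& accuIndex,
                              IndexDataType currIndex)
        {
            bool changed = false;
            ReduceOperation{}(accuVal, currVal, changed);
            if(changed)
                accuIndex = currIndex; // remember where the current best came from
        }
    };

    struct MaxOp
    {
        void operator()(float& a, float b, bool& changed) const
        {
            changed = a < b; // strict comparison: ties keep the earlier index
            if(changed)
                a = b;
        }
    };

    int main()
    {
        const float vals[4] = {2.0f, 9.0f, 4.0f, 9.0f};
        float acc           = -1.0e30f;
        int accIdx          = 0;
        for(int i = 0; i < 4; ++i)
            AccumulateWithIndexSketch<MaxOp, float, int>::Calculate(acc, vals[i], accIdx, i);
        assert(acc == 9.0f && accIdx == 1); // the first maximum wins
        return 0;
    }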
profiler/include/profile_batched_gemm_reduce_impl.hpp

@@ -171,8 +171,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
     {
         for(int m = 0; m < M; ++m)
         {
-            float d0_acc = d0_reduce_op.GetReductionZeroVal();
-            float d1_acc = d1_reduce_op.GetReductionZeroVal();
+            float d0_acc = d0_reduce_op.GetIdentityValue();
+            float d1_acc = d1_reduce_op.GetIdentityValue();

             for(int n = 0; n < N; ++n)
             {
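The only behavioral contract the profiler's host reference relies on here is that each row accumulator starts at the operator's identity before the fold over the N dimension. A toy stand-in for the d0/d1 reduce ops above (AddOp is illustrative, not the library's type):

    #include <cassert>

    struct AddOp
    {
        static float GetIdentityValue() { return 0.0f; } // was GetReductionZeroVal()
        void operator()(float& a, float b) const { a = a + b; }
    };

    int main()
    {
        constexpr int M = 2, N = 3;
        float c[M][N] = {{1, 2, 3}, {4, 5, 6}};
        float d[M];

        AddOp reduce_op;
        for(int m = 0; m < M; ++m)
        {
            float acc = reduce_op.GetIdentityValue(); // seed each row with the identity
            for(int n = 0; n < N; ++n)
                reduce_op(acc, c[m][n]);
            d[m] = acc;
        }
        assert(d[0] == 6.0f && d[1] == 15.0f);
        return 0;
    }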
profiler/include/profile_gemm_reduce_impl.hpp

@@ -165,8 +165,8 @@ bool profile_gemm_reduce_impl(int do_verification,
     for(int m = 0; m < M; ++m)
     {
-        float d0_acc = d0_reduce_op.GetReductionZeroVal();
-        float d1_acc = d1_reduce_op.GetReductionZeroVal();
+        float d0_acc = d0_reduce_op.GetIdentityValue();
+        float d1_acc = d1_reduce_op.GetIdentityValue();

         for(int n = 0; n < N; ++n)
         {
profiler/include/profile_reduce_impl.hpp

@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
     using ck::host_common::dumpBufferToFile;

     constexpr bool op_support_indices =
@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
     float best_avg_time   = 0;
     float best_gb_per_sec = 0;

-    using InElementwiseOperation_0 =
+    using InElementwiseOperation =
         typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
-    using AccElementwiseOperation_0 =
+    using AccElementwiseOperation =
         typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

+    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
+
     using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
+        DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;

     std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
             ReductionHost<InDataType,
                           AccDataType,
                           OutDataType,
-                          ReduceOpId,
+                          ReduceOperation,
+                          InElementwiseOperation,
+                          AccElementwiseOperation,
                           Rank,
                           NumReduceDim,
                           PropagateNan,
@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification,
         for(auto& reduce_ptr : reduce0_ptrs)
         {
-            InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-            AccElementwiseOperation_0 acc_elementwise_op_0(
-                static_cast<int32_t>(reduce_total_length));
+            InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
+            AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));

             auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                                 i_inStrides,
@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                                 nullptr,
                                                                 out_dev.GetDeviceBuffer(),
                                                                 out_indices_dev.GetDeviceBuffer(),
-                                                                in_elementwise_op_0,
-                                                                acc_elementwise_op_0);
+                                                                in_elementwise_op,
+                                                                acc_elementwise_op);

             if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
                 continue;
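Both elementwise operations are constructed from reduce_total_length because averaging is implemented as an ADD reduction whose accumulator-side op divides by the element count; for ops that need no divider it is simply ignored. A minimal sketch of that division of labor (PassThrough and UnaryDivide here are illustrative stand-ins, not the library's definitions):

    #include <cassert>

    struct PassThrough
    {
        explicit PassThrough(int) {} // divider ignored for ADD-like input ops
        void operator()(float& y, float x) const { y = x; }
    };

    struct UnaryDivide
    {
        explicit UnaryDivide(int divider) : divider_(divider) {}
        void operator()(float& y, float x) const { y = x / static_cast<float>(divider_); }
        int divider_;
    };

    int main()
    {
        // AVG over 4 elements == ADD reduction followed by a divide in the acc-side op.
        const float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        PassThrough in_op(4);
        UnaryDivide acc_op(4);

        float acc = 0.0f; // identity of ADD
        for(float x : in)
        {
            float t;
            in_op(t, x); // applied per element on the way in
            acc += t;
        }
        acc_op(acc, acc); // applied once to the finished accumulator
        assert(acc == 2.5f);
        return 0;
    }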
script/parse_perf_data.py

 #!/usr/bin/env python3
-import os, io, argparse, datetime
+import os, io, argparse, datetime, re
 import numpy as np
 import sqlalchemy
 from sqlalchemy.types import NVARCHAR, Float, Integer
@@ -45,66 +45,91 @@ def main():
     StrideB = []
     StrideC = []

     #parse results, get the Tflops value for "Best Perf" kernels
     glue = ""
     for filename in args.files:
         for line in open(filename):
             if 'Branch name' in line:
                 lst = line.split()
                 branch_name = lst[2]
-            if 'Best Perf' in line:
-                lst = line.split()
-                if len(lst) >= 37: #the line is complete
-                    tests.append(glue.join(lst[5:30]))
-                    kernels.append(glue.join(lst[37:]))
-                    tflops.append(lst[33])
-                    dtype.append(lst[5])
-                    alayout.append(lst[8])
-                    blayout.append(lst[11])
-                    M.append(lst[14])
-                    N.append(lst[17])
-                    K.append(lst[20])
-                    StrideA.append(lst[23])
-                    StrideB.append(lst[26])
-                    StrideC.append(lst[29])
-                elif len(lst) < 37 and len(lst) >= 33: #the tflops are available
-                    tests.append(glue.join(lst[5:30]))
-                    kernels.append("N/A")
-                    tflops.append(lst[33])
-                    dtype.append(lst[5])
-                    alayout.append(lst[8])
-                    blayout.append(lst[11])
-                    M.append(lst[14])
-                    N.append(lst[17])
-                    K.append(lst[20])
-                    StrideA.append(lst[23])
-                    StrideB.append(lst[26])
-                    StrideC.append(lst[29])
-                    print("warning: incomplete line:", lst)
-                elif len(lst) < 33: #even the tflops are not available
-                    print("Error in ckProfiler output!")
-                    print("warning: incomplete line=", lst)
-
-    #sort results
-    print("Number of tests:", len(tests))
-    print("Branch name:", branch_name)
-    #sorted_tests = sorted(tests)
-    #print("sorted tests:",sorted_tests)
-    sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
-    #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
-    test_list = list(range(1, len(tests)+1))
+            if 'Node name' in line:
+                lst = line.split()
+                node_id = lst[2]
+            if 'GPU_arch' in line:
+                lst = line.split()
+                gpu_arch = lst[1]
+            if 'HIP version' in line:
+                lst = line.split()
+                hip_vers = lst[2]
+            if 'InstalledDir' in line:
+                lst = line.split()
+                rocm_vers = lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
+
+    print("Branch name:", branch_name)
+    print("Node name:", node_id)
+    print("GPU_arch:", gpu_arch)
+    print("ROCM_version:", rocm_vers)
+    print("HIP_version:", hip_vers)
+
+    #parse gemm performance tests:
+    if 'gemm' in filename:
+        for filename in args.files:
+            for line in open(filename):
+                if 'Best Perf' in line:
+                    lst = line.split()
+                    if len(lst) >= 37: #the line is complete
+                        tests.append(glue.join(lst[5:30]))
+                        kernels.append(glue.join(lst[37:]))
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                    elif len(lst) < 37 and len(lst) >= 33: #the tflops are available
+                        tests.append(glue.join(lst[5:30]))
+                        kernels.append("N/A")
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                        print("warning: incomplete line:", lst)
+                    elif len(lst) < 33: #even the tflops are not available
+                        print("Error in ckProfiler output!")
+                        print("warning: incomplete line=", lst)
+        #sort results
+        print("Number of tests:", len(tests))
+        #sorted_tests = sorted(tests)
+        #print("sorted tests:",sorted_tests)
+        sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
+        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
+        test_list = list(range(1, len(tests)+1))
+
+    #parse resnet50 performance tests:
+    if 'resnet50' in filename:
+        for filename in args.files:
+            for line in open(filename):
+                if 'Best Perf' in line:
+                    lst = line.split()
+                    tflops.append(lst[4])
+        print("Number of tests:", len(tflops))

     sql_hostname = '127.0.0.1'
     sql_username = os.environ["dbuser"]
+    print("sql_username=", sql_username)
     sql_password = os.environ["dbpassword"]
     sql_main_database = 'miopen_perf'
     sql_port = 3306
     ssh_host = os.environ["dbsship"]
+    print("ssh_host=", ssh_host)
     ssh_user = os.environ["dbsshuser"]
+    print("ssh_user=", ssh_user)
     ssh_port = int(os.environ["dbsshport"])
     ssh_pass = os.environ["dbsshpassword"]
@@ -118,75 +143,140 @@ def main():
             format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
     conn = sqlEngine.connect()

-    #write the ck_gemm_test_params table
-    #only needed once the test set changes
-    sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
-    sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
-    sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
-    sorted_M = [x for _,x in sorted(zip(tests,M))]
-    sorted_N = [x for _,x in sorted(zip(tests,N))]
-    sorted_K = [x for _,x in sorted(zip(tests,K))]
-    sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
-    sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
-    sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
-    ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
-                    sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
-                    sorted_StrideC]
-    df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
-                    'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
-    print(df)
-    dtypes = {
-        'Test_number': Integer(),
-        'Data_type': NVARCHAR(length=5),
-        'Alayout': NVARCHAR(length=12),
-        'Blayout': NVARCHAR(length=12),
-        'M': Integer(),
-        'N': Integer(),
-        'K': Integer(),
-        'StrideA': Integer(),
-        'StrideB': Integer(),
-        'StrideC': Integer()
-    }
-    df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
-
-    #read baseline results for the latest develop branch
-    query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
-    tflops_base = pd.read_sql_query(query, conn)
-
-    #write new results to the db
-    testlist = []
-    for i in range(1, len(tests)+1):
-        testlist.append("Test%i" % i)
-    ck_gemm_tflops = [str(branch_name), str(datetime.datetime.now())]
-    flops = pd.DataFrame(data=[ck_gemm_tflops], columns=['Branch_ID','Datetime'])
-    df_add = pd.DataFrame(data=[sorted_tflops], columns=testlist)
-    flops = pd.concat([flops, df_add], axis=1)
-    print("new tflops results:", flops)
-    flops.to_sql("ck_gemm_tflops", conn, if_exists='append', index=False)
+    #save gemm performance tests:
+    if 'gemm' in filename:
+        '''
+        #write the ck_gemm_test_params table
+        #only needed once the test set changes
+        sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
+        sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
+        sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
+        sorted_M = [x for _,x in sorted(zip(tests,M))]
+        sorted_N = [x for _,x in sorted(zip(tests,N))]
+        sorted_K = [x for _,x in sorted(zip(tests,K))]
+        sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
+        sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
+        sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
+        ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
+                        sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
+                        sorted_StrideC]
+        df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
+                        'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
+        print(df)
+        dtypes = {
+            'Test_number': Integer(),
+            'Data_type': NVARCHAR(length=5),
+            'Alayout': NVARCHAR(length=12),
+            'Blayout': NVARCHAR(length=12),
+            'M': Integer(),
+            'N': Integer(),
+            'K': Integer(),
+            'StrideA': Integer(),
+            'StrideB': Integer(),
+            'StrideC': Integer()
+        }
+        df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
+        '''
+        #read baseline results for the latest develop branch
+        query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
+        tflops_base = pd.read_sql_query(query, conn)
+        #write new results to the db
+        testlist = []
+        for i in range(1, len(tests)+1):
+            testlist.append("Test%i" % i)
+        ck_gemm_tflops = [str(branch_name), str(node_id), str(gpu_arch), str(rocm_vers), str(hip_vers), str(datetime.datetime.now())]
+        flops = pd.DataFrame(data=[ck_gemm_tflops], columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
+        df_add = pd.DataFrame(data=[sorted_tflops], columns=testlist)
+        flops = pd.concat([flops, df_add], axis=1)
+        print("new tflops for gemm tests:", flops)
+        flops.to_sql("ck_gemm_tflops", conn, if_exists='append', index=False)
+
+    #save resnet50 performance tests:
+    if 'resnet50' in filename:
+        #read baseline results for the latest develop branch
+        query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
+        tflops_base_N256 = pd.read_sql_query(query, conn)
+        query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
+        tflops_base_N4 = pd.read_sql_query(query, conn)
+        #write new results to the db
+        testlist = []
+        for i in range(1, 50):
+            testlist.append("Layer%i" % i)
+        ck_resnet_tflops = [str(branch_name), str(node_id), str(gpu_arch), str(rocm_vers), str(hip_vers), str(datetime.datetime.now())]
+        flops0 = pd.DataFrame(data=[ck_resnet_tflops], columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
+        df_add = pd.DataFrame(data=[tflops[0:49]], columns=testlist)
+        flops = pd.concat([flops0, df_add], axis=1)
+        print("new tflops for N=256 resnet50 test:", flops)
+        flops.to_sql("ck_resnet50_N256_tflops", conn, if_exists='append', index=False)
+        df_add = pd.DataFrame(data=[tflops[49:98]], columns=testlist)
+        flops = pd.concat([flops0, df_add], axis=1)
+        print("new tflops for N=4 resnet50 test:", flops)
+        flops.to_sql("ck_resnet50_N4_tflops", conn, if_exists='append', index=False)

     conn.close()

-    #compare the results to the baseline
+    #compare the results to the baseline if baseline exists
     regression = 0
-    base = tflops_base[testlist].to_numpy(dtype='float')
-    base_list = base[0]
-    ave_perf = 0
-    for i in range(len(base_list)):
-        # success criterion:
-        if base_list[i] > 1.01*float(sorted_tflops[i]):
-            print("test # ", i, "shows regression by {:.3f}%".format(
-                (float(sorted_tflops[i]) - base_list[i]) / base_list[i] * 100))
-            regression = 1
-        ave_perf = ave_perf + float(sorted_tflops[i]) / base_list[i]
-    if regression == 0:
-        print("no regressions found")
-    ave_perf = ave_perf / len(base_list)
-    print("average performance relative to baseline:", ave_perf)
+    if 'gemm' in filename:
+        if not tflops_base.empty:
+            base = tflops_base[testlist].to_numpy(dtype='float')
+            base_list = base[0]
+            ave_perf = 0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i] > 1.01*float(sorted_tflops[i]):
+                    print("test # ", i, "shows regression by {:.3f}%".format(
+                        (float(sorted_tflops[i]) - base_list[i]) / base_list[i] * 100))
+                    regression = 1
+                ave_perf = ave_perf + float(sorted_tflops[i]) / base_list[i]
+            if regression == 0:
+                print("no regressions found")
+            ave_perf = ave_perf / len(base_list)
+            print("average performance relative to baseline:", ave_perf)
+        else:
+            print("could not find a baseline")
+    if 'resnet50' in filename:
+        if not tflops_base_N256.empty:
+            base = tflops_base_N256[testlist].to_numpy(dtype='float')
+            base_list = base[0]
+            ave_perf = 0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i] > 1.01*float(tflops[i]):
+                    print("layer # ", i, "shows regression by {:.3f}%".format(
+                        (float(tflops[i]) - base_list[i]) / base_list[i] * 100))
+                    regression = 1
+                ave_perf = ave_perf + float(tflops[i]) / base_list[i]
+            if regression == 0:
+                print("no regressions found")
+            ave_perf = ave_perf / len(base_list)
+            print("average performance relative to baseline:", ave_perf)
+        else:
+            print("could not find a baseline for N=256")
+        if not tflops_base_N4.empty:
+            base = tflops_base_N4[testlist].to_numpy(dtype='float')
+            base_list = base[0]
+            ave_perf = 0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i] > 1.01*float(tflops[i+49]):
+                    print("layer # ", i, "shows regression by {:.3f}%".format(
+                        (float(tflops[i+49]) - base_list[i]) / base_list[i] * 100))
+                    regression = 1
+                ave_perf = ave_perf + float(tflops[i+49]) / base_list[i]
+            if regression == 0:
+                print("no regressions found")
+            ave_perf = ave_perf / len(base_list)
+            print("average performance relative to baseline:", ave_perf)
+        else:
+            print("could not find a baseline for N=4")

     #return 0 if performance criteria met, otherwise return 1
     return regression

 if __name__ == '__main__':
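For reference, every comparison branch above applies the same success criterion, a 1% tolerance against the stored baseline. Per test i, with baseline and new throughput in TFLOPS:

% regression flagged when the baseline beats the new result by more than about 1%:
\[
t^{\mathrm{base}}_i > 1.01\, t^{\mathrm{new}}_i
\;\Longleftrightarrow\;
\frac{t^{\mathrm{new}}_i - t^{\mathrm{base}}_i}{t^{\mathrm{base}}_i} < -\frac{0.01}{1.01} \approx -0.99\%
\]
% the "average performance relative to baseline" printed by the script is the
% mean of the per-test ratios:
\[
\overline{p} = \frac{1}{n} \sum_{i=1}^{n} \frac{t^{\mathrm{new}}_i}{t^{\mathrm{base}}_i}
\]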
script/profile_conv.sh

@@ -3,9 +3,9 @@
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0

-make -j ckProfiler
+# make -j ckProfiler

-DRIVER="./profiler/ckProfiler"
+DRIVER="../build/bin/ckProfiler"

 OP=$1
 DATATYPE=$2
@@ -51,56 +51,56 @@ REPEAT=$9
 # Resnet50 from Bing
 #############
-####### op_____________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
+####### op_________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0

 # Resnet50