Commit 10962ae6 authored by rocking

Merge commit '1677cf70' into gemm_add_bias_reduction

parents 300ac4e4 1677cf70
@@ -36,7 +36,7 @@ namespace reduce {
 // Every binary operator used in reduction is represented by a templated functor class. Each functor
 // class must provide at least
 // three members:
-// 1) GetReductionZeroVal() -- the interface to return the "identity element" for the binary
+// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary
 // operator, "identity element" is the unique
 // element in the algebraic space that doesn't affect the value of other elements
 // when operated against them, and the concept is similar to zero vector in
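The rename is purely mechanical, but the concept is easiest to see in use: a reduction starts its accumulator from the identity element so that the first real input is absorbed unchanged. A minimal sketch, not part of this commit, assuming the functors below are default-constructible and update their first argument in place:

// Illustrative only: why GetIdentityValue() exists. Op is assumed to be one
// of the reduce functors in this file (Add, Mul, Max, Min, AMax).
template <typename Op, typename T, int N>
__host__ __device__ T reduce_array(const T (&data)[N])
{
    T acc = Op::GetIdentityValue(); // 0 for Add, 1 for Mul, NumericLimits<T>::Lowest() for Max, ...

    for(int i = 0; i < N; ++i)
        Op{}(acc, data[i]); // assumed in-place operator(): acc = op(acc, data[i])

    return acc;
}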
@@ -59,7 +59,7 @@ struct Add
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -76,7 +76,7 @@ struct Mul
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(1.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(1.0f); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -92,7 +92,7 @@ struct Max
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
+    __host__ __device__ static constexpr T GetIdentityValue()
     {
         return NumericLimits<T>::Lowest();
     };
@@ -125,10 +125,7 @@ struct Min
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal()
-    {
-        return NumericLimits<T>::Max();
-    };
+    __host__ __device__ static constexpr T GetIdentityValue() { return NumericLimits<T>::Max(); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -158,7 +155,7 @@ struct AMax
 {
     using dataType = T;

-    __host__ __device__ static constexpr T GetReductionZeroVal() { return static_cast<T>(0.0f); };
+    __host__ __device__ static constexpr T GetIdentityValue() { return static_cast<T>(0.0f); };

     __device__ static constexpr bool
     IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation)
@@ -184,7 +181,7 @@ struct AMax
 };

 template <typename T>
-T GetReductionZeroValueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
+T GetIdentityValueueForInMemoryDataOperation(InMemoryDataOperationEnum operation)
 {
     T result = ck::type_convert<T>(0.0f);
...
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP

#include <limits>
#include <cmath>
#include <functional>

#include "reduction_enums.hpp"
#include "data_type.hpp"
#include "math_v2.hpp"

namespace ck {
namespace host_reduce {

using ck::NanPropagation;
using ck::ReduceTensorOp;

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using ck::math::abs;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        return ([&](AccDataType&) {});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
    {
        return ([&, divider](AccDataType& a_) {
            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::MIN:
        // ReduceTensorOp::MAX:
        // ReduceTensorOp::AMAX:
        return ([&](AccDataType&) {});
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
                 ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
    {
        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_      = b_;
                changed = true;
            }
            else
                changed = false;
        });
    }
    else
    {
        // ReduceTensorOp::ADD:
        // ReduceTensorOp::MUL:
        // ReduceTensorOp::AVG:
        // ReduceTensorOp::NORM1:
        // ReduceTensorOp::NORM2:
        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp::MUL)
    {
        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
        return (ck::NumericLimits<AccDataType>::Max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
        return (ck::NumericLimits<AccDataType>::Lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
        // ReduceTensorOp::ADD
        // ReduceTensorOp::AVG
        // ReduceTensorOp::NORM1
        // ReduceTensorOp::NORM2
        return (static_cast<AccDataType>(0.0f));
    };
};

template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                     AccDataType& accuVal,
                     AccDataType currVal)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        opReduce(accuVal, currVal);
    }
    else
    {
        if(isnan(currVal))
            accuVal = currVal;
        else
            opReduce(accuVal, currVal);
    };
};

template <typename AccDataType, typename IndexDataType, bool PropagateNan>
__host__ static inline void
binop_with_index_and_nan_check(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
                               AccDataType& accuVal,
                               AccDataType currVal,
                               IndexDataType& accuIndex,
                               IndexDataType currIndex)
{
    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
        bool changed;

        opReduce(accuVal, currVal, changed);

        if(changed)
            accuIndex = currIndex;
    }
    else
    {
        if(isnan(currVal))
        {
            accuVal   = currVal;
            accuIndex = currIndex;
        }
        else
        {
            bool changed;

            opReduce(accuVal, currVal, changed);

            if(changed)
                accuIndex = currIndex;
        };
    };
};

}; // namespace host_reduce
}; // namespace ck

#endif
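For reference, a small host-side sketch (illustrative, not from this commit) showing how the helpers above compose into a complete reduction; a NORM2 reduction squares each element, accumulates by addition starting from the identity, then takes the final square root:

// Illustrative usage of the host_reduce helpers; assumes this header's
// dependencies are on the include path and a HIP/CUDA-aware host compiler.
#include <vector>

template <typename AccDataType>
AccDataType host_norm2(const std::vector<AccDataType>& data)
{
    using namespace ck::host_reduce;
    constexpr auto OpId = ck::ReduceTensorOp::NORM2;

    auto preUnaryOp = PreUnaryOpFn<AccDataType, OpId>(static_cast<int>(data.size()));     // squares each element
    auto posUnaryOp = PosUnaryOpFn<AccDataType, OpId>(static_cast<int32_t>(data.size())); // final sqrt
    auto opReduce   = ReduceOpFn<AccDataType, OpId>();                                    // accumulates by addition

    AccDataType accuVal = ReduceOpZeroVal<AccDataType, OpId>(); // identity element: 0

    for(AccDataType v : data)
    {
        preUnaryOp(v);
        binop_with_nan_check<AccDataType, false>(opReduce, accuVal, v);
    }

    posUnaryOp(accuVal);

    return accuVal;
}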
@@ -33,10 +33,10 @@
 #include "reduction_enums.hpp"
 #include "reduction_common.hpp"
-#include "host_reduce_util.hpp"
 #include "host_common_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
+#include "reduction_functions_accumulate.hpp"

 template <int NDim>
 static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
@@ -106,11 +106,13 @@ static size_t get_offset_from_index(const std::vector<size_t>& strides,
 template <typename InDataType,
           typename AccDataType,
           typename OutDataType,
-          ck::ReduceTensorOp ReduceOpId,
+          typename ReduceOperation,
+          typename InElementwiseOperation,
+          typename AccElementwiseOperation,
           int Rank,
           int NumReduceDim,
           bool PropagateNan,
-          bool NeedIndices>
+          bool OutputIndex>
 struct ReductionHost
 {
     using IndexDataType = int32_t;
@@ -122,8 +124,6 @@ struct ReductionHost
     std::vector<int> reduceDims;
     IndexDataType divider;

-    std::function<void(AccDataType&)> preUnaryOp;
-    std::function<void(AccDataType&)> posUnaryOp;
     std::array<size_t, NumReduceDim> reduceLengths;
     std::array<size_t, NumReduceDim> reduceStrides;
     std::array<size_t, NumInvariantDim> invariantLengths;
@@ -137,9 +137,6 @@ struct ReductionHost
                   const std::vector<int>& invariantDims_,
                   const std::vector<int>& reduceDims_)
     {
-        using ck::host_reduce::PosUnaryOpFn;
-        using ck::host_reduce::PreUnaryOpFn;
-
         // this->outLengths = to_int_vector(outDesc.GetLengths());
         this->outStrides = outDesc.GetStrides();
@@ -171,9 +168,6 @@ struct ReductionHost
            invariant_dim_indexes.clear();

            get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
        };
-
-        preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
-        posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
    };

    void Run(float alpha,
@@ -182,7 +176,7 @@ struct ReductionHost
             OutDataType* out_data,
             IndexDataType* out_indices)
    {
-        if constexpr(NeedIndices)
+        if constexpr(OutputIndex)
        {
            RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
        }
@@ -201,15 +195,17 @@ struct ReductionHost
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;
-        using ck::host_reduce::binop_with_index_and_nan_check;
-        using ck::host_reduce::ReduceOpFn2;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
+        using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck<PropagateNan,
+                                                                        ReduceOperation,
+                                                                        AccDataType,
+                                                                        IndexDataType>;
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

        if constexpr(NumInvariantDim == 0)
        {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal     = ReduceOperation::GetIdentityValue();
            IndexDataType accuIndex = 0;

            for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++)
@@ -219,15 +215,14 @@ struct ReductionHost
                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

                auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
            };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
@@ -241,7 +236,7 @@ struct ReductionHost
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal     = ReduceOperation::GetIdentityValue();
                IndexDataType accuIndex = 0;

                auto offset_invariant =
@@ -255,15 +250,14 @@ struct ReductionHost
                auto currVal =
                    type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

                auto currIndex = static_cast<IndexDataType>(i);

-                binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
-                    opReduce2, accuVal, currVal, accuIndex, currIndex);
+                Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex);
            };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
@@ -308,15 +302,16 @@ struct ReductionHost
        using ck::float_equal_one;
        using ck::float_equal_zero;
        using ck::type_convert;
-        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::ReduceOpFn;
-        using ck::host_reduce::ReduceOpZeroVal;

-        auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
+        using Accumulation =
+            ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
+        InElementwiseOperation in_elementwise_op(divider);
+        AccElementwiseOperation acc_elementwise_op(divider);

        if constexpr(NumInvariantDim == 0)
        {
-            AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+            AccDataType accuVal = ReduceOperation::GetIdentityValue();

            for(const auto& reduce_index : reduce_dim_indexes)
            {
@@ -325,12 +320,12 @@ struct ReductionHost
                auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
            };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
@@ -343,7 +338,7 @@ struct ReductionHost
        else
        {
            auto thread_reduce_func = [&](auto invariant_index) {
-                AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
+                AccDataType accuVal = ReduceOperation::GetIdentityValue();

                auto offset_invariant =
                    get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
@@ -356,12 +351,12 @@ struct ReductionHost
                auto currVal =
                    type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);

-                preUnaryOp(currVal);
+                in_elementwise_op(currVal, currVal);

-                binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
+                Accumulation::Calculate(accuVal, currVal);
            };

-            posUnaryOp(accuVal);
+            acc_elementwise_op(accuVal, accuVal);

            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
...
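The diff only shows the call sites of the new accumulator helpers; reduction_functions_accumulate.hpp itself is not part of this hunk. A plausible shape for the non-indexed variant, inferred from Accumulation::Calculate(accuVal, currVal) above and offered purely as a sketch:

// Inferred sketch, not the actual CK implementation. It assumes the reduction
// functors (reduce::Add etc. from the first file) expose an in-place
// operator()(AccDataType&, AccDataType).
namespace ck {
namespace detail {

template <bool PropagateNan, typename ReduceOperation, typename AccDataType>
struct AccumulateWithNanCheck
{
    __host__ __device__ static void Calculate(AccDataType& accuVal, AccDataType currVal)
    {
        if constexpr(PropagateNan)
        {
            // once a NaN is seen, it sticks to the accumulator
            if(ck::math::isnan(currVal))
            {
                accuVal = currVal;
                return;
            }
        }

        ReduceOperation{}(accuVal, currVal);
    }
};

} // namespace detail
} // namespace ck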
@@ -171,8 +171,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
         {
             for(int m = 0; m < M; ++m)
             {
-                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReductionZeroVal();
+                float d0_acc = d0_reduce_op.GetIdentityValue();
+                float d1_acc = d1_reduce_op.GetIdentityValue();

                 for(int n = 0; n < N; ++n)
                 {
...
@@ -165,8 +165,8 @@ bool profile_gemm_reduce_impl(int do_verification,
             for(int m = 0; m < M; ++m)
             {
-                float d0_acc = d0_reduce_op.GetReductionZeroVal();
-                float d1_acc = d1_reduce_op.GetReductionZeroVal();
+                float d0_acc = d0_reduce_op.GetIdentityValue();
+                float d1_acc = d1_reduce_op.GetIdentityValue();

                 for(int n = 0; n < N; ++n)
                 {
...
@@ -138,7 +138,6 @@ bool profile_reduce_impl_impl(bool do_verification,
 {
     using namespace ck::tensor_operation::device;
     using namespace ck::tensor_operation::device::device_reduce_instance;
-    using namespace ck::host_reduce;
     using ck::host_common::dumpBufferToFile;

     constexpr bool op_support_indices =
@@ -261,15 +260,17 @@ bool profile_reduce_impl_impl(bool do_verification,
     float best_avg_time   = 0;
     float best_gb_per_sec = 0;

-    using InElementwiseOperation_0 =
+    using InElementwiseOperation =
         typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
             InElementwiseOperation;
-    using AccElementwiseOperation_0 =
+    using AccElementwiseOperation =
         typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
             AccElementwiseOperation;
+    using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;

     using DeviceReduceInstPtr0 =
-        DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
+        DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;

     std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
@@ -313,7 +314,9 @@ bool profile_reduce_impl_impl(bool do_verification,
             ReductionHost<InDataType,
                           AccDataType,
                           OutDataType,
-                          ReduceOpId,
+                          ReduceOperation,
+                          InElementwiseOperation,
+                          AccElementwiseOperation,
                           Rank,
                           NumReduceDim,
                           PropagateNan,
@@ -337,9 +340,8 @@ bool profile_reduce_impl_impl(bool do_verification,
     for(auto& reduce_ptr : reduce0_ptrs)
     {
-        InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
-        AccElementwiseOperation_0 acc_elementwise_op_0(
-            static_cast<int32_t>(reduce_total_length));
+        InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
+        AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));

         auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
                                                             i_inStrides,
@@ -352,8 +354,8 @@ bool profile_reduce_impl_impl(bool do_verification,
                                                             nullptr,
                                                             out_dev.GetDeviceBuffer(),
                                                             out_indices_dev.GetDeviceBuffer(),
-                                                            in_elementwise_op_0,
-                                                            acc_elementwise_op_0);
+                                                            in_elementwise_op,
+                                                            acc_elementwise_op);

         if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
             continue;
...
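Taken together, the profiler hunks replace the enum-driven host reference with a type-driven one: ReduceOperation, InElementwiseOperation, and AccElementwiseOperation are resolved from ReduceOpId once and then handed to ReductionHost as template arguments. A condensed sketch of the resulting wiring (the constructor arguments and host buffers are elided in the diff, so placeholder names are used here):

// Condensed sketch, not a verbatim excerpt from the commit.
using InElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
    typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;

ReductionHost<InDataType,
              AccDataType,
              OutDataType,
              ReduceOperation,         // was: ck::ReduceTensorOp ReduceOpId
              InElementwiseOperation,  // new: pre-reduction elementwise functor
              AccElementwiseOperation, // new: post-reduction elementwise functor
              Rank,
              NumReduceDim,
              PropagateNan,
              OutputIndex>             // was: NeedIndices
    hostReduce(/* descriptors and dimension lists, as in the constructor above */);

hostReduce.Run(alpha, in_host_ptr, beta, out_host_ptr, out_indices_host_ptr); // placeholder buffer names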
 #!/usr/bin/env python3
-import os, io, argparse, datetime
+import os, io, argparse, datetime, re
 import numpy as np
 import sqlalchemy
 from sqlalchemy.types import NVARCHAR, Float, Integer
@@ -45,66 +45,91 @@ def main():
     StrideB=[]
     StrideC=[]

     #parse results, get the Tflops value for "Best Perf" kernels
     glue=""
     for filename in args.files:
         for line in open(filename):
             if 'Branch name' in line:
                 lst=line.split()
                 branch_name=lst[2]
-    for filename in args.files:
-        for line in open(filename):
-            if 'Best Perf' in line:
-                lst=line.split()
-                if len(lst)>=37: #the line is complete
-                    tests.append(glue.join(lst[5:30]))
-                    kernels.append(glue.join(lst[37:]))
-                    tflops.append(lst[33])
-                    dtype.append(lst[5])
-                    alayout.append(lst[8])
-                    blayout.append(lst[11])
-                    M.append(lst[14])
-                    N.append(lst[17])
-                    K.append(lst[20])
-                    StrideA.append(lst[23])
-                    StrideB.append(lst[26])
-                    StrideC.append(lst[29])
-                elif len(lst)<37 and len(lst)>=33: #the tflops are available
-                    tests.append(glue.join(lst[5:30]))
-                    kernels.append("N/A")
-                    tflops.append(lst[33])
-                    dtype.append(lst[5])
-                    alayout.append(lst[8])
-                    blayout.append(lst[11])
-                    M.append(lst[14])
-                    N.append(lst[17])
-                    K.append(lst[20])
-                    StrideA.append(lst[23])
-                    StrideB.append(lst[26])
-                    StrideC.append(lst[29])
-                    print("warning: incomplete line:",lst)
-                elif len(lst)<33: #even the tflops are not available
-                    print("Error in ckProfiler output!")
-                    print("warning: incomplete line=",lst)
-
-    #sort results
-    print("Number of tests:",len(tests))
+            if 'Node name' in line:
+                lst=line.split()
+                node_id=lst[2]
+            if 'GPU_arch' in line:
+                lst=line.split()
+                gpu_arch=lst[1]
+            if 'HIP version' in line:
+                lst=line.split()
+                hip_vers=lst[2]
+            if 'InstalledDir' in line:
+                lst=line.split()
+                rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]

     print("Branch name:",branch_name)
-    #sorted_tests = sorted(tests)
-    #print("sorted tests:",sorted_tests)
-    sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
-    #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
-    test_list=list(range(1,len(tests)+1))
+    print("Node name:",node_id)
+    print("GPU_arch:",gpu_arch)
+    print("ROCM_version:",rocm_vers)
+    print("HIP_version:",hip_vers)

+    #parse gemm performance tests:
+    if 'gemm' in filename:
+        for filename in args.files:
+            for line in open(filename):
+                if 'Best Perf' in line:
+                    lst=line.split()
+                    if len(lst)>=37: #the line is complete
+                        tests.append(glue.join(lst[5:30]))
+                        kernels.append(glue.join(lst[37:]))
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                    elif len(lst)<37 and len(lst)>=33: #the tflops are available
+                        tests.append(glue.join(lst[5:30]))
+                        kernels.append("N/A")
+                        tflops.append(lst[33])
+                        dtype.append(lst[5])
+                        alayout.append(lst[8])
+                        blayout.append(lst[11])
+                        M.append(lst[14])
+                        N.append(lst[17])
+                        K.append(lst[20])
+                        StrideA.append(lst[23])
+                        StrideB.append(lst[26])
+                        StrideC.append(lst[29])
+                        print("warning: incomplete line:",lst)
+                    elif len(lst)<33: #even the tflops are not available
+                        print("Error in ckProfiler output!")
+                        print("warning: incomplete line=",lst)
+
+        #sort results
+        #sorted_tests = sorted(tests)
+        #print("sorted tests:",sorted_tests)
+        sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
+        #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
+        test_list=list(range(1,len(tests)+1))
+
+    #parse resnet50 performance tests:
+    if 'resnet50' in filename:
+        for filename in args.files:
+            for line in open(filename):
+                if 'Best Perf' in line:
+                    lst=line.split()
+                    tflops.append(lst[4])
+
+    print("Number of tests:",len(tflops))

     sql_hostname = '127.0.0.1'
     sql_username = os.environ["dbuser"]
+    print("sql_username=",sql_username)
     sql_password = os.environ["dbpassword"]
     sql_main_database = 'miopen_perf'
     sql_port = 3306
     ssh_host = os.environ["dbsship"]
+    print("ssh_host=",ssh_host)
     ssh_user = os.environ["dbsshuser"]
+    print("ssh_user=",ssh_user)
     ssh_port = int(os.environ["dbsshport"])
     ssh_pass = os.environ["dbsshpassword"]
@@ -118,75 +143,140 @@ def main():
                 format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
     conn = sqlEngine.connect()

-    #write the ck_gemm_test_params table
-    #only needed once the test set changes
-    '''
-    sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
-    sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
-    sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
-    sorted_M = [x for _,x in sorted(zip(tests,M))]
-    sorted_N = [x for _,x in sorted(zip(tests,N))]
-    sorted_K = [x for _,x in sorted(zip(tests,K))]
-    sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
-    sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
-    sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
-    ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
-                    sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
-                    sorted_StrideC]
-    df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
-        'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
-    print(df)
-
-    dtypes = {
-        'Test_number': Integer(),
-        'Data_type': NVARCHAR(length=5),
-        'Alayout': NVARCHAR(length=12),
-        'Blayout': NVARCHAR(length=12),
-        'M': Integer(),
-        'N': Integer(),
-        'K': Integer(),
-        'StrideA': Integer(),
-        'StrideB': Integer(),
-        'StrideC': Integer()
-    }
-    df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
-    '''
-
-    #read baseline results for the latest develop branch
-    query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
-    tflops_base = pd.read_sql_query(query, conn)
-
-    #write new results to the db
-    testlist=[]
-    for i in range(1,len(tests)+1):
-        testlist.append("Test%i"%i)
-    ck_gemm_tflops=[str(branch_name),str(datetime.datetime.now())]
-    flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Datetime'])
-    df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
-    flops=pd.concat([flops,df_add],axis=1)
-    print("new tflops results:",flops)
-    flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
+    #save gemm performance tests:
+    if 'gemm' in filename:
+        #write the ck_gemm_test_params table
+        #only needed once the test set changes
+        '''
+        sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
+        sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
+        sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
+        sorted_M = [x for _,x in sorted(zip(tests,M))]
+        sorted_N = [x for _,x in sorted(zip(tests,N))]
+        sorted_K = [x for _,x in sorted(zip(tests,K))]
+        sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
+        sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
+        sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
+        ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
+                        sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
+                        sorted_StrideC]
+        df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
+            'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
+        print(df)
+
+        dtypes = {
+            'Test_number': Integer(),
+            'Data_type': NVARCHAR(length=5),
+            'Alayout': NVARCHAR(length=12),
+            'Blayout': NVARCHAR(length=12),
+            'M': Integer(),
+            'N': Integer(),
+            'K': Integer(),
+            'StrideA': Integer(),
+            'StrideB': Integer(),
+            'StrideC': Integer()
+        }
+        df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
+        '''
+
+        #read baseline results for the latest develop branch
+        query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
+        tflops_base = pd.read_sql_query(query, conn)
+
+        #write new results to the db
+        testlist=[]
+        for i in range(1,len(tests)+1):
+            testlist.append("Test%i"%i)
+        ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
+        flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
+        df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
+        flops=pd.concat([flops,df_add],axis=1)
+        print("new tflops for gemm tests:",flops)
+        flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
+
+    #save resnet50 performance tests:
+    if 'resnet50' in filename:
+        #read baseline results for the latest develop branch
+        query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
+        tflops_base_N256 = pd.read_sql_query(query, conn)
+        query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
+        tflops_base_N4 = pd.read_sql_query(query, conn)
+
+        #write new results to the db
+        testlist=[]
+        for i in range(1,50):
+            testlist.append("Layer%i"%i)
+        ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
+        flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
+        df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
+        flops=pd.concat([flops0,df_add],axis=1)
+        print("new tflops for N=256 resnet50 test:",flops)
+        flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False)
+        df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist)
+        flops=pd.concat([flops0,df_add],axis=1)
+        print("new tflops for N=4 resnet50 test:",flops)
+        flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False)

     conn.close()

-    #compare the results to the baseline
+    #compare the results to the baseline if baseline exists
     regression=0
-    base=tflops_base[testlist].to_numpy(dtype='float')
-    base_list=base[0]
-    ave_perf=0
-    for i in range(len(base_list)):
-        # success criterion:
-        if base_list[i]>1.01*float(sorted_tflops[i]):
-            print("test # ",i,"shows regression by {:.3f}%".format(
-                (float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
-            regression=1
-        ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
-    if regression==0:
-        print("no regressions found")
-    ave_perf=ave_perf/len(base_list)
-    print("average performance relative to baseline:",ave_perf)
+    if 'gemm' in filename:
+        if not tflops_base.empty:
+            base=tflops_base[testlist].to_numpy(dtype='float')
+            base_list=base[0]
+            ave_perf=0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i]>1.01*float(sorted_tflops[i]):
+                    print("test # ",i,"shows regression by {:.3f}%".format(
+                        (float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
+                    regression=1
+                ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
+            if regression==0:
+                print("no regressions found")
+            ave_perf=ave_perf/len(base_list)
+            print("average performance relative to baseline:",ave_perf)
+        else:
+            print("could not find a baseline")
+    if 'resnet50' in filename:
+        if not tflops_base_N256.empty:
+            base=tflops_base_N256[testlist].to_numpy(dtype='float')
+            base_list=base[0]
+            ave_perf=0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i]>1.01*float(tflops[i]):
+                    print("layer # ",i,"shows regression by {:.3f}%".format(
+                        (float(tflops[i])-base_list[i])/base_list[i]*100))
+                    regression=1
+                ave_perf=ave_perf+float(tflops[i])/base_list[i]
+            if regression==0:
+                print("no regressions found")
+            ave_perf=ave_perf/len(base_list)
+            print("average performance relative to baseline:",ave_perf)
+        else:
+            print("could not find a baseline for N=256")
+        if not tflops_base_N4.empty:
+            base=tflops_base_N4[testlist].to_numpy(dtype='float')
+            base_list=base[0]
+            ave_perf=0
+            for i in range(len(base_list)):
+                # success criterion:
+                if base_list[i]>1.01*float(tflops[i+49]):
+                    print("layer # ",i,"shows regression by {:.3f}%".format(
+                        (float(tflops[i+49])-base_list[i])/base_list[i]*100))
+                    regression=1
+                ave_perf=ave_perf+float(tflops[i+49])/base_list[i]
+            if regression==0:
+                print("no regressions found")
+            ave_perf=ave_perf/len(base_list)
+            print("average performance relative to baseline:",ave_perf)
        else:
            print("could not find a baseline for N=4")

     #return 0 if performance criteria met, otherwise return 1
     return regression

 if __name__ == '__main__':
...
@@ -3,9 +3,9 @@
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0

-make -j ckProfiler
-DRIVER="./profiler/ckProfiler"
+# make -j ckProfiler
+DRIVER="../build/bin/ckProfiler"

 OP=$1
 DATATYPE=$2
@@ -51,56 +51,56 @@ REPEAT=$9
 # Resnet50 from Bing
-#################### op____________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
-#profiler/ckProfiler conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
-#profiler/ckProfiler conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
+####### op_________________ datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C_ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0
+$DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1
+$DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $REPEAT $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0

 # Resnet50
...