Commit 68886f7d authored by raman jana's avatar raman jana
Browse files

merging with latest develop branch

parents a9ee2960 1677cf70
......@@ -43,13 +43,14 @@ namespace profiler {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename ALayout,
typename BLayout,
typename CLayout>
void profile_grouped_gemm_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
bool time_kernel,
const std::vector<int>& Ms,
const std::vector<int>& Ns,
const std::vector<int>& Ks,
......@@ -231,7 +232,8 @@ void profile_grouped_gemm_impl(int do_verification,
{
std::string gemm_name = gemm_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_shapes.size(); i++)
......@@ -270,6 +272,7 @@ void profile_grouped_gemm_impl(int do_verification,
ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
......
......@@ -5,74 +5,77 @@
#include "device_reduce_instance.hpp"
#include "reduction_enums.hpp"
#include "host_reduction.hpp"
#include "host_common_util.hpp"
#include "host_tensor_generator.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
template <int Rank, int NumReduceDim, int ReduceOpId, int NanOpt, int IndicesOpt>
template <int Rank, int NumReduceDim, int ReduceOpId, bool PropagateNan, bool UseIndex>
struct ReduceDescription
{
static constexpr int Rank_ = Rank;
static constexpr int NumReduceDim_ = NumReduceDim;
static constexpr int ReduceOpId_ = ReduceOpId;
static constexpr int NanOpt_ = NanOpt;
static constexpr int IndicesOpt_ = IndicesOpt;
static constexpr int PropagateNan_ = PropagateNan;
static constexpr int UseIndex_ = UseIndex;
};
using reduce_description_instances = std::tuple<ReduceDescription<4, 3, 0, 0, 0>, // for ADD
ReduceDescription<4, 4, 0, 0, 0>,
ReduceDescription<4, 1, 0, 0, 0>,
ReduceDescription<2, 1, 0, 0, 0>,
ReduceDescription<4, 3, 5, 0, 0>, // for AVG
ReduceDescription<4, 4, 5, 0, 0>,
ReduceDescription<4, 1, 5, 0, 0>,
ReduceDescription<2, 1, 5, 0, 0>,
ReduceDescription<4, 3, 7, 0, 0>, // for NORM2
ReduceDescription<4, 4, 7, 0, 0>,
ReduceDescription<4, 1, 7, 0, 0>,
ReduceDescription<2, 1, 7, 0, 0>,
ReduceDescription<4, 3, 2, 0, 0>, // for MIN
ReduceDescription<4, 4, 2, 0, 0>,
ReduceDescription<4, 1, 2, 0, 0>,
ReduceDescription<2, 1, 2, 0, 0>,
ReduceDescription<4, 3, 3, 0, 0>, // for MAX
ReduceDescription<4, 4, 3, 0, 0>,
ReduceDescription<4, 1, 3, 0, 0>,
ReduceDescription<2, 1, 3, 0, 0>,
ReduceDescription<4, 3, 4, 0, 0>, // for AMAX
ReduceDescription<4, 4, 4, 0, 0>,
ReduceDescription<4, 1, 4, 0, 0>,
ReduceDescription<2, 1, 4, 0, 0>,
ReduceDescription<4, 3, 2, 0, 1>, // for MIN
ReduceDescription<4, 4, 2, 0, 1>,
ReduceDescription<4, 1, 2, 0, 1>,
ReduceDescription<2, 1, 2, 0, 1>,
ReduceDescription<4, 3, 3, 0, 1>, // for MAX
ReduceDescription<4, 4, 3, 0, 1>,
ReduceDescription<4, 1, 3, 0, 1>,
ReduceDescription<2, 1, 3, 0, 1>,
ReduceDescription<4, 3, 4, 0, 1>, // for AMAX
ReduceDescription<4, 4, 4, 0, 1>,
ReduceDescription<4, 1, 4, 0, 1>,
ReduceDescription<2, 1, 4, 0, 1>>;
using reduce_description_instances =
std::tuple<ReduceDescription<4, 3, 0, false, false>, // for ADD
ReduceDescription<4, 4, 0, false, false>,
ReduceDescription<4, 1, 0, false, false>,
ReduceDescription<2, 1, 0, false, false>,
ReduceDescription<4, 3, 5, false, false>, // for AVG
ReduceDescription<4, 4, 5, false, false>,
ReduceDescription<4, 1, 5, false, false>,
ReduceDescription<2, 1, 5, false, false>,
ReduceDescription<4, 3, 7, false, false>, // for NORM2
ReduceDescription<4, 4, 7, false, false>,
ReduceDescription<4, 1, 7, false, false>,
ReduceDescription<2, 1, 7, false, false>,
ReduceDescription<4, 3, 2, false, false>, // for MIN
ReduceDescription<4, 4, 2, false, false>,
ReduceDescription<4, 1, 2, false, false>,
ReduceDescription<2, 1, 2, false, false>,
ReduceDescription<4, 3, 3, false, false>, // for MAX
ReduceDescription<4, 4, 3, false, false>,
ReduceDescription<4, 1, 3, false, false>,
ReduceDescription<2, 1, 3, false, false>,
ReduceDescription<4, 3, 4, false, false>, // for AMAX
ReduceDescription<4, 4, 4, false, false>,
ReduceDescription<4, 1, 4, false, false>,
ReduceDescription<2, 1, 4, false, false>,
ReduceDescription<4, 3, 2, false, true>, // for MIN
ReduceDescription<4, 4, 2, false, true>,
ReduceDescription<4, 1, 2, false, true>,
ReduceDescription<2, 1, 2, false, true>,
ReduceDescription<4, 3, 3, false, true>, // for MAX
ReduceDescription<4, 4, 3, false, true>,
ReduceDescription<4, 1, 3, false, true>,
ReduceDescription<2, 1, 3, false, true>,
ReduceDescription<4, 3, 4, false, true>, // for AMAX
ReduceDescription<4, 4, 4, false, true>,
ReduceDescription<4, 1, 4, false, true>,
ReduceDescription<2, 1, 4, false, true>>;
template <typename DescriptionType>
bool description_match(const DescriptionType& description,
int Rank,
const std::vector<int>& reduceDims,
ReduceTensorOp ReduceOpId,
NanPropagation NanOpt,
ReduceTensorIndices IndicesOpt)
bool PropagateNan,
bool UseIndex)
{
if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast<int>(ReduceOpId) ||
description.NanOpt_ != static_cast<int>(NanOpt) ||
description.IndicesOpt_ != static_cast<int>(IndicesOpt))
description.PropagateNan_ != static_cast<int>(PropagateNan) ||
description.UseIndex_ != static_cast<int>(UseIndex))
return (false);
if(DescriptionType::NumReduceDim_ != reduceDims.size())
......@@ -116,48 +119,18 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
return invariantDims;
};
/// Writes `dataNumItems` objects of type T, starting at `data`, to the file `fileName`
/// as raw bytes (object representation, no header and no conversion).
///
/// The file is opened in binary mode; std::ofstream's default open mode truncates an
/// existing file. The outcome is reported on std::cout; nothing is returned or thrown.
///
/// @param fileName      path of the file to create or overwrite
/// @param data          pointer to the first element to dump (may point to const data)
/// @param dataNumItems  number of elements of type T to write
template <typename T>
static void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems)
{
    std::ofstream outFile(fileName, std::ios::binary);

    if(!outFile)
    {
        std::cout << "Could not open file " << fileName << " for writing" << std::endl;
        return;
    }

    // The buffer is only read, so cast to const char*; this also accepts T = const U.
    outFile.write(reinterpret_cast<const char*>(data), dataNumItems * sizeof(T));
    outFile.close();

    // write() and close() both set failbit on error; only claim success when neither did.
    if(outFile.fail())
    {
        std::cout << "Could not write output to file " << fileName << std::endl;
        return;
    }

    std::cout << "Write output to file " << fileName << std::endl;
}
// Compile-time mapping from the data type used by the GPU kernels to the corresponding
// type used by the host code. The result is exposed as the nested alias `OutType`.
//
// Primary template: the host type is the same as the kernel type.
template <typename InType>
struct type_mapping
{
using OutType = InType;
};
// Specialization for ck::half_t: the host side uses half_float::half instead.
template <>
struct type_mapping<ck::half_t>
{
using OutType = half_float::half;
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
int Rank,
int NumReduceDim,
ReduceTensorOp ReduceOpId,
NanPropagation NanOpt,
ReduceTensorIndices IndicesOpt>
void profile_reduce_impl_impl(bool do_verification,
bool PropagateNan,
bool UseIndex>
bool profile_reduce_impl_impl(bool do_verification,
int init_method,
bool do_log,
bool do_dumpout,
int nrepeat,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
float alpha,
......@@ -165,16 +138,13 @@ void profile_reduce_impl_impl(bool do_verification,
{
using namespace ck::tensor_operation::device;
using namespace ck::tensor_operation::device::device_reduce_instance;
using namespace ck::host_reduce;
using ck::host_common::dumpBufferToFile;
constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX);
constexpr bool NeedIndices =
(op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));
constexpr bool PropagateNan = (NanOpt == NanPropagation::PROPAGATE_NAN);
constexpr bool OutputIndex = (op_support_indices && UseIndex);
constexpr bool out_support_atomic_add = std::is_same<OutDataType, float>::value;
constexpr bool op_support_atomic_add =
......@@ -195,8 +165,7 @@ void profile_reduce_impl_impl(bool do_verification,
(op_support_indices && !std::is_same<AccDataType, float>::value);
// 1) The indices can only be used when the reduction operation is indexable
constexpr bool invalid_reduce_3 =
(!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool invalid_reduce_3 = (!op_support_indices && UseIndex);
// 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations
// 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction
......@@ -219,6 +188,8 @@ void profile_reduce_impl_impl(bool do_verification,
constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 ||
invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6);
bool pass = true;
if constexpr(!invalid_reduce)
{
Tensor<InDataType> in(inLengths);
......@@ -282,42 +253,26 @@ void profile_reduce_impl_impl(bool do_verification,
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0;
DeviceMem out_indices_dev(indicesSizeInBytes);
float best_avg_time = 0;
float best_gb_per_sec = 0;
using InElementwiseOperation_0 =
using InElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
InElementwiseOperation;
using AccElementwiseOperation_0 =
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::
AccElementwiseOperation;
using InElementwiseOperation_1 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
InElementwiseOperation;
using AccElementwiseOperation_1 =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, false>::
AccElementwiseOperation;
using InElementwiseOperation_2 =
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
InElementwiseOperation;
using AccElementwiseOperation_2 =
typename reduce_unary_operator<AccDataType, ReduceOpId, false, true>::
AccElementwiseOperation;
using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using DeviceReduceInstPtr0 =
DeviceReducePtr<InElementwiseOperation_0, AccElementwiseOperation_0>;
using DeviceReduceInstPtr1 =
DeviceReducePtr<InElementwiseOperation_1, AccElementwiseOperation_1>;
using DeviceReduceInstPtr2 =
DeviceReducePtr<InElementwiseOperation_2, AccElementwiseOperation_2>;
DeviceReducePtr<InElementwiseOperation, AccElementwiseOperation>;
std::vector<DeviceReduceInstPtr0> reduce0_ptrs;
std::vector<DeviceReduceInstPtr1> reduce1_ptrs;
std::vector<DeviceReduceInstPtr2> reduce2_ptrs;
add_device_reduce_instance_threadwise<InDataType,
AccDataType,
......@@ -325,8 +280,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
PropagateNan,
UseIndex>(reduce0_ptrs);
add_device_reduce_instance_blockwise<InDataType,
AccDataType,
......@@ -334,8 +289,8 @@ void profile_reduce_impl_impl(bool do_verification,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
PropagateNan,
UseIndex>(reduce0_ptrs);
if constexpr(use_atomic_add)
{
......@@ -345,35 +300,11 @@ void profile_reduce_impl_impl(bool do_verification,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce0_ptrs);
PropagateNan,
UseIndex>(reduce0_ptrs);
}
else
{
add_device_reduce_instance_multiblock_partial_reduce<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce1_ptrs);
};
// used for secondary reduction
if constexpr(!use_atomic_add)
{
add_device_reduce_instance_blockwise_second_call<AccDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOpId,
NanOpt,
IndicesOpt>(reduce2_ptrs);
};
if(reduce0_ptrs.empty() && reduce1_ptrs.empty())
if(reduce0_ptrs.empty())
{
throw std::runtime_error("Wrong! No device REDUCE instance found");
};
......@@ -383,31 +314,34 @@ void profile_reduce_impl_impl(bool do_verification,
ReductionHost<InDataType,
AccDataType,
OutDataType,
ReduceOpId,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
Rank,
NumReduceDim,
PropagateNan,
NeedIndices>
OutputIndex>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
};
const auto i_inLengths = to_int_vector(inLengths);
const auto i_inStrides = to_int_vector(inStrides);
const auto i_outLengths = to_int_vector(outLengths);
const auto i_outStrides = to_int_vector(outStrides);
std::vector<ck::index_t> i_inLengths;
std::vector<ck::index_t> i_inStrides;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end());
i_inStrides.assign(inStrides.begin(), inStrides.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
for(auto& reduce_ptr : reduce0_ptrs)
{
auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
DeviceMem ws_dev(wsSizeInBytes);
InElementwiseOperation_0 in_elementwise_op_0(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_0 acc_elementwise_op_0(
static_cast<int32_t>(reduce_total_length));
InElementwiseOperation in_elementwise_op(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation acc_elementwise_op(static_cast<int32_t>(reduce_total_length));
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
i_inStrides,
......@@ -417,11 +351,11 @@ void profile_reduce_impl_impl(bool do_verification,
alpha,
beta,
in_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
in_elementwise_op_0,
acc_elementwise_op_0);
in_elementwise_op,
acc_elementwise_op);
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
......@@ -430,7 +364,8 @@ void profile_reduce_impl_impl(bool do_verification,
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes =
invariant_total_length * reduce_total_length * sizeof(InDataType) +
......@@ -438,8 +373,9 @@ void profile_reduce_impl_impl(bool do_verification,
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name
<< std::endl;
if(time_kernel)
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< reduce_name << std::endl;
if(gb_per_sec > best_gb_per_sec)
{
......@@ -449,22 +385,24 @@ void profile_reduce_impl_impl(bool do_verification,
if(do_verification)
{
bool single_pass;
out_dev.FromDevice(out.mData.data());
ck::utils::check_err(out.mData, out_ref.mData);
single_pass = ck::utils::check_err(out.mData, out_ref.mData);
if(NeedIndices)
if(OutputIndex)
{
out_indices_dev.FromDevice(out_indices.mData.data());
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
;
single_pass = single_pass &&
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
};
if(do_log)
if(!single_pass)
{
LogRangeAsType<float>(std::cout << "out_host : ", out_ref.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",") << std::endl;
};
std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl;
}
pass = pass && single_pass;
};
if(do_dumpout)
......@@ -473,7 +411,7 @@ void profile_reduce_impl_impl(bool do_verification,
dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
dumpBufferToFile(
"dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
if(NeedIndices)
if(OutputIndex)
{
dumpBufferToFile("dump_indices.bin",
out_indices.mData.data(),
......@@ -485,156 +423,34 @@ void profile_reduce_impl_impl(bool do_verification,
};
};
for(auto& reduce_ptr : reduce1_ptrs)
{
auto wsSizeInBytes = reduce_ptr->GetWorkspaceSizeInBytes(i_inLengths, reduceDims);
DeviceMem ws_dev(wsSizeInBytes);
InElementwiseOperation_1 in_elementwise_op_1(static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_1 acc_elementwise_op_1(
static_cast<int32_t>(reduce_total_length));
auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
in_elementwise_op_1,
acc_elementwise_op_1);
if(!reduce_ptr->IsSupportedArgument(argument_ptr.get()))
continue;
std::string reduce_name = reduce_ptr->GetTypeString();
auto invoker_ptr = reduce_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t num_bytes =
invariant_total_length * reduce_total_length * sizeof(InDataType) +
invariant_total_length * sizeof(OutDataType);
std::vector<int> inLengths2 = reduce_ptr->GetWorkspace2dLengths(argument_ptr.get());
std::vector<int> inStrides2{inLengths2[1], 1};
for(auto& reduce2_ptr : reduce2_ptrs)
{
InElementwiseOperation_2 in_elementwise_op_2(
static_cast<int32_t>(reduce_total_length));
AccElementwiseOperation_2 acc_elementwise_op_2(
static_cast<int32_t>(reduce_total_length));
auto argument2_ptr =
reduce2_ptr->MakeArgumentPointer(inLengths2,
inStrides2,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
ws_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
in_elementwise_op_2,
acc_elementwise_op_2);
if(!reduce2_ptr->IsSupportedArgument(argument2_ptr.get()))
continue;
std::string reduce2_name = reduce2_ptr->GetTypeString();
auto invoker2_ptr = reduce2_ptr->MakeInvokerPointer();
float avg_time_2 = invoker2_ptr->Run(argument2_ptr.get(), nrepeat);
std::size_t num_bytes_2 =
static_cast<size_t>(inLengths2[0]) * inLengths2[1] * sizeof(AccDataType);
float gb_per_sec = (num_bytes + num_bytes_2) / 1.E6 / (avg_time + avg_time_2);
std::cout << "Perf: " << (avg_time + avg_time_2) << " ms, " << gb_per_sec
<< " GB/s, " << reduce_name << " => " << reduce2_name << std::endl;
if(gb_per_sec > best_gb_per_sec)
{
best_avg_time = avg_time + avg_time_2;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
ck::utils::check_err(out.mData, out_ref.mData);
if(NeedIndices)
{
out_indices_dev.FromDevice(out_indices.mData.data());
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
;
};
if(do_log)
{
LogRangeAsType<float>(std::cout << "out_host : ", out_ref.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out.mData, ",")
<< std::endl;
}
}
if(do_dumpout)
{
dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize());
dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize());
dumpBufferToFile(
"dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize());
if(NeedIndices)
{
dumpBufferToFile("dump_indices.bin",
out_indices.mData.data(),
out_indices.mDesc.GetElementSize());
dumpBufferToFile("dump_indices_host.bin",
out_indices_ref.mData.data(),
out_indices_ref.mDesc.GetElementSize());
};
};
};
};
std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
<< std::endl;
if(time_kernel)
std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s"
<< std::endl;
}
else
{
std::cout << "The requested reduction operation is not supported, please check !!!"
<< std::endl;
};
return pass;
};
template <typename InDataType, typename AccDataType, typename OutDataType>
void profile_reduce_impl(bool do_verification,
bool profile_reduce_impl(bool do_verification,
int init_method,
bool do_log,
bool do_dumpout,
int nrepeat,
bool time_kernel,
const std::vector<size_t>& inLengths,
const std::vector<int>& reduceDims,
ReduceTensorOp ReduceOpId,
NanPropagation NanOpt,
ReduceTensorIndices IndicesOpt,
bool PropagateNan,
bool UseIndex,
float alpha,
float beta)
{
bool matched = false;
bool pass = true;
using tuple_of_description_instances =
tensor_operation::device::device_reduce_instance::reduce_description_instances;
......@@ -648,29 +464,30 @@ void profile_reduce_impl(bool do_verification,
using descType = remove_cvref_t<decltype(std::get<i>(tuple_object))>;
if(!description_match(
descType{}, inLengths.size(), reduceDims, ReduceOpId, NanOpt, IndicesOpt))
descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex))
return;
profile_reduce_impl_impl<InDataType,
AccDataType,
OutDataType,
descType::Rank_,
descType::NumReduceDim_,
static_cast<ReduceTensorOp>(descType::ReduceOpId_),
static_cast<NanPropagation>(descType::NanOpt_),
static_cast<ReduceTensorIndices>(descType::IndicesOpt_)>(
do_verification,
init_method,
do_log,
do_dumpout,
nrepeat,
inLengths,
reduceDims,
alpha,
beta);
pass = pass &&
profile_reduce_impl_impl<InDataType,
AccDataType,
OutDataType,
descType::Rank_,
descType::NumReduceDim_,
static_cast<ReduceTensorOp>(descType::ReduceOpId_),
static_cast<bool>(descType::PropagateNan_),
static_cast<bool>(descType::UseIndex_)>(do_verification,
init_method,
do_dumpout,
time_kernel,
inLengths,
reduceDims,
alpha,
beta);
matched = true;
});
return pass;
};
} // namespace profiler
......
......@@ -48,8 +48,8 @@ int profile_batched_gemm(int argc, char* argv[])
printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
exit(1);
}
......@@ -59,7 +59,7 @@ int profile_batched_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -82,7 +82,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -102,7 +102,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -122,7 +122,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -142,7 +142,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -162,7 +162,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -182,7 +182,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -202,7 +202,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -222,7 +222,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -242,7 +242,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -262,7 +262,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -282,7 +282,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -302,7 +302,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -322,7 +322,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -342,7 +342,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -362,7 +362,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -382,7 +382,7 @@ int profile_batched_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -396,5 +396,5 @@ int profile_batched_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -33,8 +33,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
printf("arg15: split k into mulitiple batch\n");
exit(1);
......@@ -45,7 +45,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -69,7 +69,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -91,7 +91,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -113,7 +113,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -135,7 +135,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -149,5 +149,5 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "profile_conv_bwd_data_impl.hpp"
/// Element-type combination selected on the command line. profile_conv_bwd_data casts the
/// integer parsed from argv[2] directly to this enum, so the numeric values matter.
enum class ConvDataType
{
    F32_F32_F32    = 0,
    F16_F16_F16    = 1,
    BF16_BF16_BF16 = 2,
    INT8_INT8_INT8 = 3,
};
/// Input tensor layout selector. profile_conv_bwd_data casts the integer parsed from
/// argv[3] directly to this enum, so the numeric values matter.
enum class ConvInputLayout
{
    NCHW = 0,
    NHWC = 1,
};
/// Weight tensor layout selector. profile_conv_bwd_data casts the integer parsed from
/// argv[4] directly to this enum, so the numeric values matter.
enum class ConvWeightLayout
{
    KCYX = 0,
    KYXC = 1,
};
/// Output tensor layout selector. profile_conv_bwd_data casts the integer parsed from
/// argv[5] directly to this enum, so the numeric values matter.
enum class ConvOutputLayout
{
    NKHW = 0,
    NHWK = 1,
};
int profile_conv_bwd_data(int argc, char* argv[])
{
if(argc != 25)
{
printf("arg1: tensor operation (conv_bwd: BackwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
const ck::index_t C = std::stoi(argv[12]);
const ck::index_t Y = std::stoi(argv[13]);
const ck::index_t X = std::stoi(argv[14]);
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]);
const ck::index_t conv_stride_h = std::stoi(argv[17]);
const ck::index_t conv_stride_w = std::stoi(argv[18]);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]);
const ck::index_t in_left_pad_h = std::stoi(argv[21]);
const ck::index_t in_left_pad_w = std::stoi(argv[22]);
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]);
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_bwd_data_impl<2,
float,
float,
float,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_bwd_data_impl<2,
ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_bwd_data_impl<2,
uint16_t,
uint16_t,
uint16_t,
float,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
ck::profiler::profile_conv_bwd_data_impl<2,
int8_t,
int8_t,
int8_t,
int32_t,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
nrepeat,
N,
K,
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
}
else
{
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
}
......@@ -58,7 +58,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
......@@ -98,7 +98,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
N,
K,
C,
......@@ -124,7 +124,7 @@ int profile_conv_bwd_weight(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
N,
K,
C,
......@@ -142,5 +142,5 @@ int profile_conv_bwd_weight(int argc, char* argv[])
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -42,7 +42,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
......@@ -55,7 +55,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
......@@ -93,7 +93,7 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
N,
K,
C,
......@@ -110,5 +110,5 @@ int profile_conv_fwd_bias_relu(int argc, char* argv[])
throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
}
return 1;
return 0;
}
......@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
......@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
......@@ -94,7 +94,7 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
N,
K,
C,
......@@ -111,5 +111,5 @@ int profile_conv_fwd_bias_relu_add(int argc, char* argv[])
throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
}
return 1;
return 0;
}
......@@ -43,7 +43,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(1);
......@@ -56,7 +56,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const bool time_kernel = std::stoi(argv[9]);
const ck::index_t N = std::stoi(argv[10]);
const ck::index_t K = std::stoi(argv[11]);
......@@ -95,7 +95,7 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
N,
K,
C,
......@@ -112,5 +112,5 @@ int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
}
return 1;
return 0;
}
......@@ -95,7 +95,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
return 1;
......@@ -108,7 +108,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const int nrepeat = std::stoi(argv[9]);
const bool time_kernel = std::stoi(argv[9]);
ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
......@@ -132,7 +132,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
params.N_,
params.K_,
params.C_,
......@@ -157,7 +157,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
params.N_,
params.K_,
params.C_,
......@@ -182,7 +182,7 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
params.N_,
params.K_,
params.C_,
......
......@@ -119,7 +119,7 @@ template <int NDim,
void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
int nrepeat,
bool time_kernel,
int init_method,
ConvLayouts)
{
......@@ -185,7 +185,7 @@ void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
reference_conv_fwd_fun);
auto best_conf = run_engine.Profile(
conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
nrepeat,
time_kernel,
do_verification,
do_log);
......@@ -201,7 +201,7 @@ void profile_convnd_instances(ConvDataType data_type,
const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
int nrepeat,
bool time_kernel,
int init_method)
{
switch(data_layout)
......@@ -214,7 +214,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
......@@ -223,7 +223,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
......@@ -232,7 +232,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
......@@ -241,7 +241,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
......@@ -256,7 +256,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
......@@ -265,7 +265,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
......@@ -274,7 +274,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
......@@ -283,7 +283,7 @@ void profile_convnd_instances(ConvDataType data_type,
params,
do_verification,
do_log,
nrepeat,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
......@@ -304,7 +304,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
bool do_verification{true};
int init_method{2};
bool do_log{false};
int nrepeat{100};
bool time_kernel{false};
int num_dim_spatial{2};
ConvParams params;
......@@ -318,7 +318,7 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
do_verification = std::stoi(argv[4]);
init_method = std::stoi(argv[5]);
do_log = std::stoi(argv[6]);
nrepeat = std::stoi(argv[7]);
time_kernel = std::stoi(argv[7]);
num_dim_spatial = std::stoi(argv[8]);
}
if(argc >= 10)
......@@ -332,20 +332,20 @@ int ck::profiler::profile_convnd_fwd(int argc, char* argv[])
{
case 1:
profile_convnd_instances<1>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
case 2:
profile_convnd_instances<2>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
case 3:
profile_convnd_instances<3>(
data_type, data_layout, params, do_verification, do_log, nrepeat, init_method);
data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
default:
throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
std::to_string(num_dim_spatial));
}
return 1;
return 0;
}
......@@ -38,8 +38,8 @@ int profile_gemm(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
......@@ -50,7 +50,7 @@ int profile_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -68,13 +68,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -88,13 +89,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -108,13 +110,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -128,13 +131,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -146,6 +150,7 @@ int profile_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
float,
ck::tensor_layout::gemm::RowMajor,
......@@ -154,7 +159,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -166,6 +171,7 @@ int profile_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
float,
ck::tensor_layout::gemm::RowMajor,
......@@ -174,7 +180,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -186,6 +192,7 @@ int profile_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
float,
ck::tensor_layout::gemm::ColumnMajor,
......@@ -194,7 +201,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -206,6 +213,7 @@ int profile_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_gemm_impl<float,
float,
float,
float,
ck::tensor_layout::gemm::ColumnMajor,
......@@ -214,7 +222,7 @@ int profile_gemm(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -228,13 +236,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<int8_t,
int8_t,
int8_t,
int32_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -248,13 +257,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<int8_t,
int8_t,
int8_t,
int32_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -268,13 +278,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<int8_t,
int8_t,
int8_t,
int32_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -288,13 +299,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<int8_t,
int8_t,
int8_t,
int32_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -308,13 +320,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::bhalf_t,
ck::bhalf_t,
ck::bhalf_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -328,13 +341,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::bhalf_t,
ck::bhalf_t,
ck::bhalf_t,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -348,13 +362,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::bhalf_t,
ck::bhalf_t,
ck::bhalf_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -368,13 +383,14 @@ int profile_gemm(int argc, char* argv[])
ck::profiler::profile_gemm_impl<ck::bhalf_t,
ck::bhalf_t,
ck::bhalf_t,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -388,5 +404,5 @@ int profile_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -36,8 +36,8 @@ int profile_gemm_bias_2d(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: alpha\n");
printf("arg15: beta\n");
......@@ -50,7 +50,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -76,7 +76,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -99,7 +99,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -122,7 +122,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -145,7 +145,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -168,7 +168,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -191,7 +191,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -214,7 +214,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -237,7 +237,7 @@ int profile_gemm_bias_2d(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -252,5 +252,5 @@ int profile_gemm_bias_2d(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -36,8 +36,8 @@ int profile_gemm_bias_relu(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
......@@ -48,7 +48,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -69,7 +69,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -88,7 +88,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -107,7 +107,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -126,7 +126,7 @@ int profile_gemm_bias_relu(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -139,5 +139,5 @@ int profile_gemm_bias_relu(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -36,8 +36,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n");
printf("arg15: split k into mulitiple batch\n");
exit(1);
......@@ -48,7 +48,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -70,7 +70,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -90,7 +90,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -110,7 +110,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -130,7 +130,7 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -144,5 +144,5 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -32,8 +32,8 @@ int profile_gemm_reduce(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
printf("arg14: split k into mulitiple batch\n");
exit(1);
......@@ -44,7 +44,7 @@ int profile_gemm_reduce(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
......@@ -66,7 +66,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -87,7 +87,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -108,7 +108,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -129,7 +129,7 @@ int profile_gemm_reduce(int argc, char* argv[])
do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
M,
N,
K,
......@@ -142,5 +142,5 @@ int profile_gemm_reduce(int argc, char* argv[])
throw std::runtime_error("wrong! this data_type & layout is not implemented");
}
return 1;
return 0;
}
......@@ -54,8 +54,8 @@ int profile_grouped_gemm(int argc, char* argv[])
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg7: run kernel # of times (>1)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n");
exit(1);
......@@ -66,7 +66,7 @@ int profile_grouped_gemm(int argc, char* argv[])
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const int nrepeat = std::stoi(argv[7]);
const bool time_kernel = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]);
......@@ -79,6 +79,7 @@ int profile_grouped_gemm(int argc, char* argv[])
if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
......@@ -86,7 +87,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
Ms,
Ns,
Ks,
......@@ -97,6 +98,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::RowMajor,
......@@ -104,7 +106,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
Ms,
Ns,
Ks,
......@@ -115,6 +117,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
......@@ -122,7 +125,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
Ms,
Ns,
Ks,
......@@ -133,6 +136,7 @@ int profile_grouped_gemm(int argc, char* argv[])
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_grouped_gemm_impl<ck::half_t,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_layout::gemm::ColumnMajor,
......@@ -140,7 +144,7 @@ int profile_grouped_gemm(int argc, char* argv[])
ck::tensor_layout::gemm::RowMajor>(do_verification,
init_method,
do_log,
nrepeat,
time_kernel,
Ms,
Ns,
Ks,
......@@ -153,5 +157,5 @@ int profile_grouped_gemm(int argc, char* argv[])
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 1;
return 0;
}
#include <iostream>
#include <fstream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <vector>
#include <stdexcept>
#include <sstream>
#include <getopt.h>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "data_type_enum.hpp"
#include "reduction_enums.hpp"
#include "host_common_util.hpp"
#include "profile_reduce_impl.hpp"
using namespace std;
using ck::NanPropagation;
using ck::ReduceTensorIndices;
using ck::ReduceTensorOp;
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
......@@ -38,63 +30,9 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
{"bf16", no_argument, nullptr, '?'},
{"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'},
{"log", required_argument, nullptr, 'l'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
// Parses one value of type T from the beginning of a string.
//
// The value is read with a formatted stream extraction (operator>>), so
// leading whitespace is skipped and any characters following the value are
// left unread.
//
// T        : type to extract; must be value-initializable and support
//            extraction from a std::istream.
// valueStr : text holding the value.
// returns  : the extracted value, or a value-initialized T (zero for
//            arithmetic types) when the string is empty or blank.
template <typename T>
static T getSingleValueFromString(const std::string& valueStr)
{
    std::istringstream iss(valueStr);

    // Value-initialize the result: on empty/blank input the extraction fails
    // before anything is stored, which used to leave `val` indeterminate and
    // made the return statement read an uninitialized object.
    T val{};

    iss >> val;

    return val;
}
// Splits a comma-separated C string into its fields and converts every field
// to T with getSingleValueFromString<T>.
//
// Each ',' is a separator, so N commas always produce N + 1 values; a field
// may be empty (for example after a trailing comma), in which case the result
// is whatever getSingleValueFromString<T> returns for an empty string.
//
// cstr_values : NUL-terminated text such as "64,4,280,82".
// returns     : the converted fields, in the order they appear in the text.
template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
    const std::string text(cstr_values);
    std::vector<T> parsed;

    std::size_t fieldBegin = 0;

    for(;;)
    {
        const std::size_t comma = text.find(',', fieldBegin);
        const bool isLastField  = (comma == std::string::npos);

        // The final field runs to the end of the text; every other field stops
        // just before its separating comma.
        const std::string field =
            isLastField ? text.substr(fieldBegin) : text.substr(fieldBegin, comma - fieldBegin);

        parsed.push_back(getSingleValueFromString<T>(field));

        if(isLastField)
            break;

        fieldBegin = comma + 1;
    }

    return parsed;
}
// Data-type selector for the reduce profiler.
//
// The numeric values are significant: the integers given to the compType /
// outType command-line options are converted to this enum with a static_cast,
// so each enumerator must keep its value.
enum class AppDataType
{
    appHalf     = 0, // half precision
    appFloat    = 1, // single precision
    appInt32    = 2, // 32-bit integer
    appInt8     = 3, // 8-bit integer
    appInt8x4   = 4, // 8-bit integer, x4 variant
    appBFloat16 = 5, // bfloat16
    appDouble   = 6, // double precision
};
static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
{
for(auto dim : reduceDims)
......@@ -113,7 +51,7 @@ static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims
};
};
class AppArgs
class ReduceProfilerArgs
{
private:
int option_index = 0;
......@@ -130,26 +68,23 @@ class AppArgs
std::vector<float> scales;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
AppDataType compTypeId = AppDataType::appFloat;
AppDataType outTypeId = AppDataType::appFloat;
ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float;
ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float;
bool compType_assigned = false;
bool outType_assigned = false;
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN;
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
bool do_log = false;
bool do_verification = false;
bool do_dumpout = false;
int nanOpt = 0;
int indicesOpt = 0;
bool do_verification = false;
bool do_dumpout = false;
int init_method;
int nrepeat;
bool time_kernel;
bool need_indices = false;
AppArgs() = default;
~AppArgs() = default;
ReduceProfilerArgs() = default;
~ReduceProfilerArgs() = default;
void show_usage(const char* cmd)
{
......@@ -166,8 +101,11 @@ class AppArgs
std::cout << "--outType or -W, optional enum value indicating the type of the reduced "
"output, which could be float when the input data is half"
<< std::endl;
std::cout << "--nanOpt or -N, enum value indicates the selection for NanOpt" << std::endl;
std::cout << "--indicesOpt or -I, enum value indicates the selection for IndicesOpt"
std::cout
<< "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation"
<< std::endl;
std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use "
"index in reduction"
<< std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl;
......@@ -181,18 +119,19 @@ class AppArgs
std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files "
"for further analysis"
<< std::endl;
std::cout << "--log or -l, 1/0 to indicate whether to log some information" << std::endl;
};
int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
optind++; // to skip the "reduce" module name
while(1)
{
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:l:", long_options, &option_index);
ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
......@@ -219,27 +158,27 @@ class AppArgs
if(!optarg)
throw std::runtime_error("Invalid option format!");
compTypeId = static_cast<AppDataType>(std::atoi(optarg));
compTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
compType_assigned = true;
break;
case 'W':
if(!optarg)
throw std::runtime_error("Invalid option format!");
outTypeId = static_cast<AppDataType>(std::atoi(optarg));
outTypeId = static_cast<ck::DataTypeEnum>(std::atoi(optarg));
outType_assigned = true;
break;
case 'N':
if(!optarg)
throw std::runtime_error("Invalid option format!");
nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
nanOpt = std::atoi(optarg);
break;
case 'I':
if(!optarg)
throw std::runtime_error("Invalid option format!");
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
indicesOpt = std::atoi(optarg);
break;
case 'S':
if(!optarg)
......@@ -262,12 +201,6 @@ class AppArgs
do_dumpout = static_cast<bool>(std::atoi(optarg));
break;
case 'l':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_log = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "half")
use_half = true;
......@@ -295,7 +228,7 @@ class AppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
init_method = std::atoi(argv[optind++]);
nrepeat = std::atoi(argv[optind]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));
if(scales.empty())
{
......@@ -306,9 +239,6 @@ class AppArgs
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
reduceOp == ReduceTensorOp::AMAX)
{
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
need_indices = true;
// for indexable operations, no need to assign compType and outType, just let them be
// same as inType
compType_assigned = false;
......@@ -322,9 +252,10 @@ class AppArgs
int profile_reduce(int argc, char* argv[])
{
using namespace ck::profiler;
using ck::DataTypeEnum;
using ck::profiler::profile_reduce_impl;
AppArgs args;
ReduceProfilerArgs args;
if(args.processArgs(argc, argv) < 0)
return (-1);
......@@ -339,42 +270,41 @@ int profile_reduce(int argc, char* argv[])
if(args.use_half)
{
if(!args.compType_assigned)
args.compTypeId = AppDataType::appHalf;
args.compTypeId = DataTypeEnum::Half;
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
args.outTypeId = AppDataType::appFloat;
(args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appHalf;
args.outTypeId = DataTypeEnum::Half;
if(args.compTypeId == AppDataType::appHalf)
if(args.compTypeId == DataTypeEnum::Half)
{
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
args.scales[0],
args.scales[1]);
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(
args.do_verification,
args.init_method,
args.do_dumpout,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appFloat)
else if(args.compTypeId == DataTypeEnum::Float)
{
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
......@@ -385,56 +315,53 @@ int profile_reduce(int argc, char* argv[])
{
profile_reduce_impl<double, double, double>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.use_int8)
{
if(!args.compType_assigned)
args.compTypeId = AppDataType::appInt8;
args.compTypeId = DataTypeEnum::Int8;
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
args.outTypeId = AppDataType::appInt32;
(args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32))
args.outTypeId = DataTypeEnum::Int32;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appInt8;
args.outTypeId = DataTypeEnum::Int8;
if(args.compTypeId == AppDataType::appInt8)
if(args.compTypeId == DataTypeEnum::Int8)
{
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appInt32)
else if(args.compTypeId == DataTypeEnum::Int32)
{
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
......@@ -444,54 +371,51 @@ int profile_reduce(int argc, char* argv[])
else if(args.use_bf16)
{
if(args.outType_assigned &&
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
args.outTypeId = AppDataType::appFloat;
(args.outTypeId != DataTypeEnum::BFloat16 && args.outTypeId != DataTypeEnum::Float))
args.outTypeId = DataTypeEnum::Float;
if(!args.outType_assigned)
args.outTypeId = AppDataType::appBFloat16;
args.outTypeId = DataTypeEnum::BFloat16;
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else
{
if(args.compTypeId == AppDataType::appFloat)
if(args.compTypeId == DataTypeEnum::Float)
{
profile_reduce_impl<float, float, float>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
else if(args.compTypeId == AppDataType::appDouble)
else if(args.compTypeId == DataTypeEnum::Double)
{
profile_reduce_impl<float, double, float>(args.do_verification,
args.init_method,
args.do_log,
args.do_dumpout,
args.nrepeat,
args.time_kernel,
args.inLengths,
args.reduceDims,
args.reduceOp,
args.nanOpt,
args.indicesOpt,
static_cast<bool>(args.nanOpt),
static_cast<bool>(args.indicesOpt),
args.scales[0],
args.scales[1]);
}
......
......@@ -13,6 +13,7 @@ int profile_gemm_bias_relu_add(int, char*[]);
int profile_gemm_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
......@@ -53,7 +54,7 @@ int main(int argc, char* argv[])
}
else if(strcmp(argv[1], "grouped_gemm") == 0)
{
profile_grouped_gemm(argc, argv);
return profile_grouped_gemm(argc, argv);
}
else if(strcmp(argv[1], "conv_fwd") == 0)
{
......@@ -107,7 +108,7 @@ int main(int argc, char* argv[])
" conv1d_bwd_data: BackwardConvolution data 1 dim\n"
" conv2d_bwd_data: BackwardConvolution data 2 dim\n"
" conv3d_bwd_data: BackwardConvolution data 3 dim\n"
" reduce: REDUCE\n"
" reduce: Reduce\n"
" conv2d_bwd_weight: Backward Weight Convolution 2d\n");
// clang-format on
}
......
#!/usr/bin/env python3
import os, io
import argparse
def print_to_string(*args, **kwargs):
    """Return the text that ``print(*args, **kwargs)`` would have written.

    The arguments are forwarded unchanged to the built-in ``print``, with
    its ``file`` target redirected to an in-memory buffer, so ``sep`` and
    ``end`` behave exactly as they do for ``print``.

    Returns:
        str: the captured output, including the trailing ``end`` string
        (a newline unless ``end`` is overridden).
    """
    # The context manager closes the buffer even if print() raises.
    with io.StringIO() as buffer:
        print(*args, file=buffer, **kwargs)
        return buffer.getvalue()
def parse_args():
    """Parse the command line and resolve the list of log files to read.

    The single positional argument ``filename`` is either a log file or a
    directory. For a directory, every entry whose name contains ``log`` is
    selected; otherwise the argument itself is the only file.

    Returns:
        argparse.Namespace: the parsed arguments, with an extra ``files``
        attribute holding the list of paths to process.
    """
    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
    parser.add_argument('filename', type=str,
                        help='Log file to parse or directory containing log files')
    args = parser.parse_args()

    if os.path.isdir(args.filename):
        # os.listdir() returns entries in arbitrary order; sort them so the
        # files are always processed in the same sequence.
        args.files = [os.path.join(args.filename, name)
                      for name in sorted(os.listdir(args.filename))
                      if 'log' in name]
    else:
        args.files = [args.filename]
    return args
def main():
    """Collect the "Best Perf" lines from the selected log files and print them.

    For every line containing ``Best Perf`` the line is split on whitespace;
    the tokens from index 8 onwards are concatenated without a separator and
    rendered together with token 4 via ``print_to_string``.

    Returns:
        int: always 0 (the baseline comparison is not implemented yet).
    """
    args = parse_args()
    results = []
    # parse results
    glue = ""
    for filename in args.files:
        # Use a context manager so each log file is closed after reading.
        with open(filename) as log_file:
            for line in log_file:
                if 'Best Perf' not in line:
                    continue
                lst = line.split()
                if len(lst) < 5:
                    # Token 4 is required below; report a truncated line
                    # instead of failing with IndexError.
                    print("warning: incomplete line:", lst)
                    continue
                results.append(print_to_string(glue.join(lst[8:]), lst[4]))
    # Not implemented yet:
    #   sort results
    #   read baseline results for the latest develop branch
    #   write new results to the db
    #   compare the results to the baseline
    #   return 0 if performance criteria met, otherwise return 1
    print(results)
    return 0
if __name__ == '__main__':
#!/usr/bin/env python3
import os, io, argparse, datetime, re
import numpy as np
import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer
import pymysql
import pandas as pd
from sshtunnel import SSHTunnelForwarder
def print_to_string(*args, **kwargs):
    """Return the text that ``print(*args, **kwargs)`` would have written.

    The arguments are forwarded unchanged to the built-in ``print``, with
    its ``file`` target redirected to an in-memory buffer, so ``sep`` and
    ``end`` behave exactly as they do for ``print``.

    Returns:
        str: the captured output, including the trailing ``end`` string
        (a newline unless ``end`` is overridden).
    """
    # The context manager closes the buffer even if print() raises.
    with io.StringIO() as buffer:
        print(*args, file=buffer, **kwargs)
        return buffer.getvalue()
def parse_args():
    """Parse the command line and resolve the list of log files to read.

    The single positional argument ``filename`` is either a log file or a
    directory. For a directory, every entry whose name contains ``log`` is
    selected; otherwise the argument itself is the only file.

    Returns:
        argparse.Namespace: the parsed arguments, with an extra ``files``
        attribute holding the list of paths to process.
    """
    parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
    parser.add_argument('filename', type=str,
                        help='Log file to parse or directory containing log files')
    args = parser.parse_args()

    if os.path.isdir(args.filename):
        # os.listdir() returns entries in arbitrary order; sort them so the
        # files are always processed in the same sequence.
        args.files = [os.path.join(args.filename, name)
                      for name in sorted(os.listdir(args.filename))
                      if 'log' in name]
    else:
        args.files = [args.filename]
    return args
# Entry point: reads the log files selected by parse_args(), extracts run
# metadata and the "Best Perf" figures, appends them to a MySQL database
# reached through an SSH tunnel, and compares them against the most recent
# 'develop' baseline row. Returns 0 when no regression was flagged, else 1.
def main():
args = parse_args()
# Per-test accumulators filled from the "Best Perf" lines of the gemm logs.
tests = []
kernels=[]
tflops=[]
dtype=[]
alayout=[]
blayout=[]
M=[]
N=[]
K=[]
StrideA=[]
StrideB=[]
StrideC=[]
#parse results, get the Tflops value for "Best Perf" kernels
glue=""
# First pass: pick up run metadata. Each value is a fixed whitespace-split
# token of the matching line.
# NOTE(review): these variables are only assigned when a matching line is
# found; a log without them would presumably fail at the prints below - confirm.
for filename in args.files:
for line in open(filename):
if 'Branch name' in line:
lst=line.split()
branch_name=lst[2]
if 'Node name' in line:
lst=line.split()
node_id=lst[2]
if 'GPU_arch' in line:
lst=line.split()
gpu_arch=lst[1]
if 'HIP version' in line:
lst=line.split()
hip_vers=lst[2]
if 'InstalledDir' in line:
lst=line.split()
# ROCm version = the text between '/opt/rocm-' and '/llvm/bin' in the second token.
rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
print("Branch name:",branch_name)
print("Node name:",node_id)
print("GPU_arch:",gpu_arch)
print("ROCM_version:",rocm_vers)
print("HIP_version:",hip_vers)
#parse gemm performance tests:
# NOTE(review): `filename` here is the loop variable left over from the loop
# above, so the test kind appears to be decided by a single file name - confirm.
if 'gemm' in filename:
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
# NOTE(review): this branch admits len(lst)==33 but reads lst[33], which needs
# at least 34 tokens - looks like an off-by-one; confirm.
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
# Order the tflops values by their test description string so they line up
# with the Test1..TestN columns built further down.
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
#parse resnet50 performance tests:
if 'resnet50' in filename:
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
tflops.append(lst[4])
print("Number of tests:",len(tflops))
# Database and SSH settings; credentials and the SSH endpoint are read from
# environment variables (a missing one raises KeyError).
sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"]
sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf'
sql_port = 3306
ssh_host = os.environ["dbsship"]
ssh_user = os.environ["dbsshuser"]
ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"]
# Open the SSH tunnel and connect SQLAlchemy to its local bind port.
with SSHTunnelForwarder(
(ssh_host, ssh_port),
ssh_username=ssh_user,
ssh_password=ssh_pass,
remote_bind_address=(sql_hostname, sql_port)) as tunnel:
sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect()
#save gemm performance tests:
if 'gemm' in filename:
#write the ck_gemm_test_params table
#only needed once the test set changes
# The triple-quoted string below is disabled code kept for reference.
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
'''
#read baseline results for the latest develop branch
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
tflops_base = pd.read_sql_query(query, conn)
#write new results to the db
# One row per run: six metadata columns followed by Test1..TestN.
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops for gemm tests:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
#save resnet50 performance tests:
if 'resnet50' in filename:
#read baseline results for the latest develop branch
query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
tflops_base_N256 = pd.read_sql_query(query, conn)
query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
tflops_base_N4 = pd.read_sql_query(query, conn)
#write new results to the db
# Columns Layer1..Layer49; the first 49 parsed values go to the N256 table
# and the next 49 (indices 49..97) to the N4 table.
testlist=[]
for i in range(1,50):
testlist.append("Layer%i"%i)
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=256 resnet50 test:",flops)
flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False)
df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=4 resnet50 test:",flops)
flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False)
conn.close()
#compare the results to the baseline if baseline exists
# A test counts as a regression when the baseline value exceeds 1.01 times
# the new value. ave_perf accumulates new/baseline ratios for the average.
regression=0
if 'gemm' in filename:
if not tflops_base.empty:
base=tflops_base[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(sorted_tflops[i]):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline")
if 'resnet50' in filename:
# N=256 comparison uses tflops[0..48].
if not tflops_base_N256.empty:
base=tflops_base_N256[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(tflops[i]):
print("layer # ",i,"shows regression by {:.3f}%".format(
(float(tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline for N=256")
# N=4 comparison uses the second half of the parsed values (offset 49).
if not tflops_base_N4.empty:
base=tflops_base_N4[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(tflops[i+49]):
print("layer # ",i,"shows regression by {:.3f}%".format(
(float(tflops[i+49])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(tflops[i+49])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline for N=4")
#return 0 if performance criteria met, otherwise return 1
return regression
if __name__ == '__main__':
main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment