Unverified commit a1b2441f, authored by Qianfeng and committed by GitHub

Batchnorm inference instances, external API, client examples and gtests (#531)

* File renaming and class renaming for device element-wise operation

* Add batchnorm-infer instances, external API and client example

* Add batchnorm-infer profiler module and gtests

* Remove file device_elementwise_extension.hpp and move NormalizeInInfer operation to element_wise_operation.hpp

* Remove the using of class aliasing for DeviceElementwiseForBatchNormInfer

* Rename class and file due to conflict from device_elementwise_2d.hpp

* Fix namespace in batchnorm_infer_nhwc client example
parent 52abc2f3
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
// clang-format off
template <index_t Rank>
using device_batchnorm_infer_f16_instances =
std::tuple <
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanVectorSize, VarVectorSize, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
>;
// clang-format on
void add_device_batchnorm_infer_rank_4_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 4>>>& instances)
{
add_device_operation_instances(instances, device_batchnorm_infer_f16_instances<4>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
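For orientation, below is a minimal client-side sketch of how the rank-4 f16 instances registered above can be looked up through the instance factory and launched; it mirrors the profiler code later in this change and is not part of the PR itself. The helper name, buffer pointers and stride arrays are placeholders, and the broadcast strides for mean/variance/scale/bias are assumed to carry 0 in the reduced (N, H, W) dimensions.

// Sketch only: the helper name and its parameters are illustrative, not part of this change.
#include <array>

#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp"

using F16       = ck::half_t;
using F32       = float;
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;

// Same operation signature as add_device_batchnorm_infer_rank_4_f16_instances above:
// inputs <x, mean, variance, scale, bias>, output <y>, rank 4 (NHWC).
using DeviceOp = ck::tensor_operation::device::
    DeviceElementwise<ck::Tuple<F16, F32, F32, F16, F16>, ck::Tuple<F16>, Normalize, 4>;

void run_batchnorm_infer_nhwc(const std::array<ck::index_t, 4>& xyLengths,
                              const std::array<ck::index_t, 4>& xyStrides,
                              const std::array<ck::index_t, 4>& bnBroadcastStrides,
                              const void* p_x,
                              const void* p_mean,
                              const void* p_variance,
                              const void* p_scale,
                              const void* p_bias,
                              void* p_y,
                              double epsilon)
{
    // Pull every registered instance for this operation signature.
    const auto instance_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            xyLengths,
            {xyStrides, bnBroadcastStrides, bnBroadcastStrides, bnBroadcastStrides, bnBroadcastStrides},
            {xyStrides},
            {p_x, p_mean, p_variance, p_scale, p_bias},
            {p_y},
            Normalize{epsilon});

        // Run the first instance whose vector sizes are compatible with the given strides.
        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            inst_ptr->MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{nullptr, false});
            return;
        }
    }
}

The profiler further below loops over all instances and times each one; a real client would typically stop at the first supported instance, as sketched here.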
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F32 = float;
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
// clang-format off
template <index_t Rank>
using device_batchnorm_infer_f32_instances =
std::tuple <
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanVectorSize, VarVectorSize, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 1, 1, 1, 1>, Sequence<4> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<1, 4, 4, 4, 4>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 2, 2, 2, 2>, Sequence<4> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<2, 4, 4, 4, 4>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, Rank, 4, Sequence<4, 4, 4, 4, 4>, Sequence<4> >
>;
// clang-format on
void add_device_batchnorm_infer_rank_4_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<Tuple<F32, F32, F32, F32, F32>, Tuple<F32>, Normalize, 4>>>& instances)
{
add_device_operation_instances(instances, device_batchnorm_infer_f32_instances<4>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F64 = double;
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
// clang-format off
template <index_t Rank>
using device_batchnorm_infer_f64_instances =
std::tuple <
// Tuple<XDataType, MeanDataType, VarDataType, ScaleDataType, BiasDataType>, Tuple<YDataType>, NormalizeOp, Rank, MPerThread, Sequence<XVectorSize, MeanVectorSize, VarVectorSize, ScaleVectorSize, BiasVectorSize>, Sequence<YVectorSize>
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 2, Sequence<2, 2, 2, 2, 2>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<1, 1, 1, 1, 1>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<2, 1, 1, 1, 1>, Sequence<2> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<1, 2, 2, 2, 2>, Sequence<1> >,
DeviceElementwiseImpl<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, Rank, 4, Sequence<2, 2, 2, 2, 2>, Sequence<2> >
>;
// clang-format on
void add_device_batchnorm_infer_rank_4_f64_instances(
std::vector<std::unique_ptr<
DeviceElementwise<Tuple<F64, F64, F64, F64, F64>, Tuple<F64>, Normalize, 4>>>& instances)
{
add_device_operation_instances(instances, device_batchnorm_infer_f64_instances<4>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -6,7 +6,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 
 namespace ck {
@@ -28,15 +28,15 @@ using Normalize = ck::tensor_operation::element_wise::Normalize;
 using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
     // clang-format off
     //###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
-    DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
-    DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
-    DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
-    DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
+    DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
     // clang-format on
     >;
 
 void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
-    std::vector<DeviceElementwiseBasePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
+    std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
         instances)
 {
     add_device_operation_instances(
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <array>
#include <iomanip>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <thread>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp"
namespace ck {
namespace profiler {
template <typename XDataType,
typename YDataType,
typename AccDataType,
typename ScaleDataType,
typename BiasDataType,
typename MeanVarDataType,
index_t Rank,
index_t NumBatchNormReduceDim>
bool profile_batchnorm_infer_impl(int do_verification,
int init_method,
bool do_dumpout,
bool time_kernel,
const std::vector<size_t> inOutLengths,
const std::vector<int> reduceDims,
double epsilon)
{
if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim)
{
throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!");
};
std::vector<size_t> scaleBiasMeanVarLengths;
std::vector<int> invariantDims;
// used for calculating the effective transferred bytes by each operation
size_t total_length;
size_t invariant_length = 1;
total_length = std::accumulate(
inOutLengths.begin(), inOutLengths.end(), size_t{1}, std::multiplies<size_t>{});
if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; }))
throw std::runtime_error("Invalid reduce dimensions!");
for(int dim = 0; dim < Rank; dim++)
{
if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; }))
{
invariantDims.push_back(dim);
scaleBiasMeanVarLengths.push_back(inOutLengths[dim]);
invariant_length *= inOutLengths[dim];
};
}
// input data of the batchnorm infer algorithm
Tensor<XDataType> x(inOutLengths);
Tensor<ScaleDataType> scale(scaleBiasMeanVarLengths);
Tensor<BiasDataType> bias(scaleBiasMeanVarLengths);
Tensor<MeanVarDataType> estimatedMean(scaleBiasMeanVarLengths);
Tensor<MeanVarDataType> estimatedVariance(scaleBiasMeanVarLengths);
// output data of the batchnorm infer algorithm
Tensor<YDataType> y_ref(inOutLengths);
Tensor<YDataType> y(inOutLengths);
auto inOutStrides = x.mDesc.GetStrides();
auto scaleBiasMeanVarStrides = scale.mDesc.GetStrides();
std::size_t num_thread = std::thread::hardware_concurrency();
const float x_mean = 0.0f;
const float x_stddev = 1.0f;
const float noise_stddev = 0.04f;
// input data in normal distribution
x.GenerateTensorValue(GeneratorTensor_4<XDataType>{x_mean, x_stddev}, num_thread);
// initialize estimatedMean with values that deviate only slightly from the mean of the x values
estimatedMean.GenerateTensorValue(GeneratorTensor_4<MeanVarDataType>{x_mean, noise_stddev},
num_thread);
// initialize estimatedVariance with values that deviate only slightly from the variance of the x values
estimatedVariance.GenerateTensorValue(
GeneratorTensor_4<MeanVarDataType>{x_stddev * x_stddev, noise_stddev}, num_thread);
if(do_verification)
{
switch(init_method)
{
case 0:
scale.GenerateTensorValue(GeneratorTensor_0<ScaleDataType>{}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_0<BiasDataType>{}, num_thread);
break;
case 1:
scale.GenerateTensorValue(GeneratorTensor_1<ScaleDataType>{1}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_1<BiasDataType>{0}, num_thread);
break;
case 2:
scale.GenerateTensorValue(GeneratorTensor_2<ScaleDataType>{-5, 5}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_2<BiasDataType>{-5, 5}, num_thread);
break;
default:
scale.GenerateTensorValue(GeneratorTensor_3<ScaleDataType>{-1.0f, 1.0f}, num_thread);
bias.GenerateTensorValue(GeneratorTensor_3<BiasDataType>{-1.0f, 1.0f}, num_thread);
}
};
// these buffers are usually provided by the user application
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
DeviceMem scale_dev(sizeof(ScaleDataType) * scale.mDesc.GetElementSpaceSize());
DeviceMem bias_dev(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize());
// estimatedMean_dev
DeviceMem estimatedMean_dev(sizeof(MeanVarDataType) *
estimatedMean.mDesc.GetElementSpaceSize());
// estimatedVariance_dev
DeviceMem estimatedVariance_dev(sizeof(MeanVarDataType) *
estimatedVariance.mDesc.GetElementSpaceSize());
x_dev.ToDevice(x.mData.data());
scale_dev.ToDevice(scale.mData.data());
bias_dev.ToDevice(bias.mData.data());
estimatedMean_dev.ToDevice(estimatedMean.mData.data());
estimatedVariance_dev.ToDevice(estimatedVariance.mData.data());
std::array<index_t, Rank> arrInOutLengths;
std::array<index_t, Rank> arrInOutStrides;
std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
std::array<int, NumBatchNormReduceDim> arrReduceDims;
std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin());
std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin());
std::copy(scaleBiasMeanVarLengths.begin(),
scaleBiasMeanVarLengths.end(),
arrScaleBiasMeanVarLengths.begin());
std::copy(scaleBiasMeanVarStrides.begin(),
scaleBiasMeanVarStrides.end(),
arrScaleBiasMeanVarStrides.begin());
std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{0};
int i = 0;
for(auto dim : invariantDims)
{
assert(inOutLengths[dim] == scaleBiasMeanVarLengths[i]);
aligned_scaleBiasMeanVarStrides[dim] = scaleBiasMeanVarStrides[i];
i++;
};
using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer;
// add device batchnorm-infer instances
using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
ck::Tuple<YDataType>,
Normalize,
Rank>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
using ReferenceBatchNormInferInstance =
ck::tensor_operation::host::ReferenceBatchNormInfer<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
PassThroughOp,
Rank,
NumBatchNormReduceDim>;
auto batchNormInfer_ref = ReferenceBatchNormInferInstance{};
auto argument_ptr_ref =
batchNormInfer_ref.MakeArgumentPointer(arrInOutLengths,
arrInOutStrides,
arrInOutStrides,
arrReduceDims,
arrScaleBiasMeanVarLengths,
arrScaleBiasMeanVarStrides,
arrScaleBiasMeanVarStrides,
arrScaleBiasMeanVarStrides,
x.mData.data(),
scale.mData.data(),
bias.mData.data(),
epsilon,
PassThroughOp{},
estimatedMean.mData.data(),
estimatedVariance.mData.data(),
y_ref.mData.data());
if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get()))
{
std::cout << "The runtime parameters not supported by the reference instance, exiting!"
<< std::endl;
return (false);
};
auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer();
(void)invoker_ptr_ref->Run(argument_ptr_ref.get());
}
int num_kernel = 0;
bool pass = true;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(arrInOutLengths,
{arrInOutStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides,
aligned_scaleBiasMeanVarStrides},
{arrInOutStrides},
{x_dev.GetDeviceBuffer(),
estimatedMean_dev.GetDeviceBuffer(),
estimatedVariance_dev.GetDeviceBuffer(),
scale_dev.GetDeviceBuffer(),
bias_dev.GetDeviceBuffer()},
{y_dev.GetDeviceBuffer()},
Normalize{epsilon});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
num_kernel++;
}
else
{
if(time_kernel)
{
std::cout << inst_ptr->GetTypeString()
<< " skipped due to unsupported argument: " << std::endl;
}
continue;
};
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
size_t num_bytes = 0;
// reads x, mean, variance, scale and bias; writes y
num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) +
invariant_length *
(sizeof(ScaleDataType) + sizeof(BiasDataType) + sizeof(MeanVarDataType));
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
using ck::utils::check_err;
bool single_pass;
y_dev.FromDevice(y.mData.data());
if constexpr(ck::is_same_v<YDataType, ck::bhalf_t>)
single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2);
else
single_pass = check_err(y.mData, y_ref.mData, "y results", 4e-3, 4e-3);
pass = pass && single_pass;
};
if(do_dumpout)
{
using ck::host_common::dumpBufferToFile;
// clang-format off
dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize());
dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize());
dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize());
// clang-format on
};
}
if(time_kernel)
{
std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is applicable" << std::endl;
return false;
}
return pass;
}
} // namespace profiler
} // namespace ck
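For readers following the verification path: the host reference op and the NormalizeInInfer functor are both expected to evaluate the standard batchnorm inference formula per element. The following is a minimal host-side sketch of that math, given here as an assumption for illustration rather than the device implementation in this PR.

#include <cmath>

// y = scale * (x - mean) / sqrt(variance + epsilon) + bias, accumulated in AccT.
template <typename T, typename AccT>
T batchnorm_infer_elem(T x, AccT mean, AccT variance, T scale, T bias, double epsilon)
{
    const double inv_std = 1.0 / std::sqrt(static_cast<double>(variance) + epsilon);

    const AccT y_acc = static_cast<AccT>(scale) *
                           (static_cast<AccT>(x) - mean) * static_cast<AccT>(inv_std) +
                       static_cast<AccT>(bias);

    return static_cast<T>(y_acc);
}

The check_err tolerances used above (1e-2 for bf16, 4e-3 otherwise) bound how far the device results may deviate from this reference.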
@@ -27,6 +27,7 @@ set(PROFILER_SOURCES
     profile_softmax.cpp
     profile_batchnorm_fwd.cpp
     profile_batchnorm_bwd.cpp
+    profile_batchnorm_infer.cpp
 )
 
 set(PROFILER_EXECUTABLE ckProfiler)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <limits>
#include <vector>
#include <getopt.h>
#include "ck/library/utility/host_common_util.hpp"
#include "profiler/profile_batchnorm_infer_impl.hpp"
#include "profiler_operation_registry.hpp"
using ck::index_t;
using namespace std;
static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'},
{"reduceDims", required_argument, nullptr, 'R'},
{"dumpout", required_argument, nullptr, 'o'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};
class BatchnormInferArgParser
{
private:
int option_index = 0;
public:
std::vector<size_t> inLengths;
std::vector<int> reduceDims;
bool do_verification = false;
bool do_dumpout = false;
bool updateMovingAverage;
bool saveMeanAndInvVariance;
int data_type = 0;
int init_method = 2;
bool time_kernel = false;
BatchnormInferArgParser() = default;
~BatchnormInferArgParser() = default;
void show_usage(const char* cmd)
{
// clang-format off
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl;
std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl;
// clang-format on
};
int operator()(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;
int ch;
optind++; // to skip the module name
while(1)
{
ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
{
case 'D':
if(!optarg)
throw std::runtime_error("Invalid option format!");
inLengths = getTypeValuesFromString<size_t>(optarg);
break;
case 'R':
if(!optarg)
throw std::runtime_error("Invalid option format!");
reduceDims = getTypeValuesFromString<int>(optarg);
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_verification = static_cast<bool>(std::atoi(optarg));
break;
case 'o':
if(!optarg)
throw std::runtime_error("Invalid option format!");
do_dumpout = static_cast<bool>(std::atoi(optarg));
break;
case '?':
if(std::string(long_options[option_index].name) == "help")
{
show_usage(argv[0]);
return -1;
};
break;
default:
show_usage(argv[0]);
std::cerr << "Invalid cmd-line options!" << std::endl;
return -1;
};
};
if(optind + 3 > argc)
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");
data_type = std::atoi(argv[optind++]);
init_method = std::atoi(argv[optind++]);
time_kernel = static_cast<bool>(std::atoi(argv[optind++]));
if(data_type != 0 && data_type != 1 && data_type != 5 && data_type != 6)
return -1;
return 0;
};
}; // end of class BatchnormInferArgParser
static const double epsilon = std::numeric_limits<float>::epsilon();
int profile_batchnorm_infer(int argc, char* argv[])
{
using ck::profiler::profile_batchnorm_infer_impl;
BatchnormInferArgParser arg_parser;
if(arg_parser(argc, argv) != 0)
return -1;
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
if(arg_parser.data_type == 0)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_infer_impl<F16, F16, F32, F16, F16, F32, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
epsilon);
};
}
else if(arg_parser.data_type == 1)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_infer_impl<F32, F32, F32, F32, F32, F32, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
epsilon);
};
}
else if(arg_parser.data_type == 5)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_infer_impl<BF16, BF16, F32, BF16, BF16, F32, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
epsilon);
};
}
else if(arg_parser.data_type == 6)
{
if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3)
{
profile_batchnorm_infer_impl<F64, F64, F64, F64, F64, F64, 4, 3>(
arg_parser.do_verification,
arg_parser.init_method,
arg_parser.do_dumpout,
arg_parser.time_kernel,
arg_parser.inLengths,
arg_parser.reduceDims,
epsilon);
};
}
return 0;
}
REGISTER_PROFILER_OPERATION("bnorm_infer", "Batchnorm inference", profile_batchnorm_infer);
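Once this module is compiled into ckProfiler, the new path can be exercised from the command line. Based on the argument parser above, an NHWC fp16 run would look roughly like `ckProfiler bnorm_infer -D 128,16,6,512 -R 0,1,2 -v 1 0 3 1` (data type 0 = fp16, init method 3, time kernel enabled); the exact binary path and invocation are assumptions, so adjust them to the local build.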
 add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp)
 add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp)
+add_gtest_executable(test_batchnorm_infer_rank_4 batchnorm_infer_rank_4.cpp)
 target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance)
 target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance)
+target_link_libraries(test_batchnorm_infer_rank_4 PRIVATE utility device_batchnorm_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <limits>
#include <tuple>
#include <vector>
#include <gtest/gtest.h>
#include "profiler/profile_batchnorm_infer_impl.hpp"
using F16 = ck::half_t;
using F32 = float;
using BF16 = ck::bhalf_t;
using F64 = double;
template <typename Tuple>
class TestBatchNormInferRank4 : public ::testing::Test
{
private:
const double epsilon = std::numeric_limits<float>::epsilon();
protected:
using XDataType = std::tuple_element_t<0, Tuple>;
using YDataType = std::tuple_element_t<1, Tuple>;
using AccDataType = std::tuple_element_t<2, Tuple>;
using ScaleDataType = std::tuple_element_t<3, Tuple>;
using BiasDataType = std::tuple_element_t<4, Tuple>;
using MeanVarDataType = std::tuple_element_t<5, Tuple>;
std::vector<std::vector<size_t>> list_of_lengths = {
{128, 16, 3, 1024}, {128, 16, 6, 512}, {4, 4, 4, 4}, {32, 32, 32, 32}};
std::vector<int> reduceDims;
template <int NumReduceDim>
void Run()
{
for(auto& inOutLengths : list_of_lengths)
{
bool pass = true;
EXPECT_FALSE(reduceDims.size() != NumReduceDim);
pass = pass && ck::profiler::profile_batchnorm_infer_impl<XDataType,
YDataType,
AccDataType,
ScaleDataType,
BiasDataType,
MeanVarDataType,
4,
NumReduceDim>(
true, 3, false, false, inOutLengths, reduceDims, epsilon);
EXPECT_TRUE(pass);
}
}
};
using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>,
std::tuple<F32, F32, F32, F32, F32, F32>,
std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
std::tuple<F64, F64, F64, F64, F64, F64>>;
TYPED_TEST_SUITE(TestBatchNormInferRank4, KernelTypes);
// nhwc
TYPED_TEST(TestBatchNormInferRank4, nhwc)
{
this->reduceDims = {0, 1, 2};
this->template Run<3>();
}
// nchw
TYPED_TEST(TestBatchNormInferRank4, nchw)
{
this->reduceDims = {0, 2, 3};
this->template Run<3>();
}