Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/variadic_elementwise_ops_impl.h"
#include "core/providers/rocm/cu_inc/variadic_elementwise_impl.cuh"
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
#include "core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh"
#include "core/providers/rocm/math/variadic_elementwise_ops_tags.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename VariadicElementwiseOpTag>
struct VariadicElementwiseOpTraits;
#define DEFINE_TRAITS(VariadicElementwiseOpTag, ImplName) \
template <typename T> \
struct VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag> { \
using ScalarComputeFunctor = OP_##ImplName<T, T, T>; \
\
static void ComputeFn( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count) { \
Impl_##ImplName( \
stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
rhs_data, \
fdm_output_strides, \
fdm_H, \
fdm_C, \
output_data, \
count); \
} \
};
DEFINE_TRAITS(variadic_elementwise_ops::Sum, Add)
DEFINE_TRAITS(variadic_elementwise_ops::Min, Min)
DEFINE_TRAITS(variadic_elementwise_ops::Max, Max)
#undef DEFINE_TRAITS
template <typename T, typename VariadicElementwiseOpTag>
void Impl_General(
hipStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T* lhs_data,
const TArray<int64_t>* rhs_padded_strides,
const T* rhs_data,
const TArray<fast_divmod>* fdm_output_strides,
const fast_divmod& fdm_H,
const fast_divmod& fdm_C,
T* output_data,
size_t count) {
VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ComputeFn(
stream,
output_rank_or_simple_broadcast,
lhs_padded_strides,
lhs_data,
rhs_padded_strides,
rhs_data,
fdm_output_strides,
fdm_H,
fdm_C,
output_data,
count);
}
template <typename T, typename VariadicElementwiseOpTag>
void Impl_NoBroadcastInputBatch(
hipStream_t stream,
InputBatchArray<T> input_data_batch,
T* output_data,
size_t count) {
VariadicElementWiseNoBroadcastInputBatchImpl<
T, typename VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ScalarComputeFunctor,
k_max_input_batch_size>(
stream,
typename VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ScalarComputeFunctor{},
count,
input_data_batch,
output_data);
}
#define SPECIALIZE_IMPL(T, VariadicElementwiseOpTag) \
template void Impl_General<T, VariadicElementwiseOpTag>( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count); \
\
template void Impl_NoBroadcastInputBatch<T, VariadicElementwiseOpTag>( \
hipStream_t stream, \
InputBatchArray<T> input_data_batch, \
T * output_data, \
size_t count);
// the postfix means the types supported by the op:
// B: uint8_t
// W: uint16_t
// U: uint32_t
// Z: uint64_t
// C: int8_t
// S: int16_t
// I: int32_t
// L: int64_t
// H: float16
// F: float
// D: double
// O: bool
#define SPECIALIZE_IMPL_HFD(VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(half, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(float, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(double, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(BFloat16, VariadicElementwiseOpTag)
#define SPECIALIZE_IMPL_UZILHFD(VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(uint32_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(uint64_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(int32_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(int64_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL_HFD(VariadicElementwiseOpTag)
SPECIALIZE_IMPL_HFD(variadic_elementwise_ops::Sum)
SPECIALIZE_IMPL_UZILHFD(variadic_elementwise_ops::Min)
SPECIALIZE_IMPL_UZILHFD(variadic_elementwise_ops::Max)
#undef SPECIALIZE_IMPL_UZILHFD
#undef SPECIALIZE_IMPL_HFD
#undef SPECIALIZE_IMPL
} // namespace rocm
} // namespace onnxruntime
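The traits defined above route each variadic op tag to an existing binary element-wise implementation (Sum reuses Add; Min and Max reuse their binary counterparts). As a side note, here is a minimal standalone sketch of the same tag-dispatch traits pattern; the names are hypothetical and it has no ONNX Runtime dependencies:
#include <algorithm>
#include <iostream>
// Hypothetical tags mirroring variadic_elementwise_ops::Sum / ::Min.
struct SumTag {};
struct MinTag {};
// Primary template is declared but not defined, so only tagged specializations compile.
template <typename T, typename Tag>
struct OpTraits;
template <typename T>
struct OpTraits<T, SumTag> {
  static T Apply(T a, T b) { return a + b; }  // Sum falls back to binary addition
};
template <typename T>
struct OpTraits<T, MinTag> {
  static T Apply(T a, T b) { return std::min(a, b); }
};
int main() {
  std::cout << OpTraits<float, SumTag>::Apply(1.5f, 2.5f) << "\n";  // prints 4
  std::cout << OpTraits<float, MinTag>::Apply(1.5f, 2.5f) << "\n";  // prints 1.5
}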
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <cstdint>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename VariadicElementwiseOpTag>
void Impl_General(
hipStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T* lhs_data,
const TArray<int64_t>* rhs_padded_strides,
const T* rhs_data,
const TArray<fast_divmod>* fdm_output_strides,
const fast_divmod& fdm_H,
const fast_divmod& fdm_C,
T* output_data,
size_t count);
constexpr int32_t k_max_input_batch_size = 8;
template <typename T>
using InputBatchArray = TArray<const T*, k_max_input_batch_size>;
template <typename T, typename VariadicElementwiseOpTag>
void Impl_NoBroadcastInputBatch(
hipStream_t stream,
InputBatchArray<T> input_data_batch,
T* output_data,
size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
namespace rocm {
namespace variadic_elementwise_ops {
struct Sum {};
struct Min {};
struct Max {};
} // namespace variadic_elementwise_ops
} // namespace rocm
} // namespace onnxruntime
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include <vector>
#include "core/common/common.h"
#include "core/common/gsl.h"
namespace onnxruntime {
namespace rocm {
// initial reference from:
// https://github.com/NVIDIA/apex/blob/5b71d3695bf39efcdcda9dff5be2f70314b8f091/csrc/multi_tensor_apply.cuh#L15
// Further experiments were run to get the numbers below. The larger the better, but if too large, they won't fit into the GPU stack.
constexpr int ACTUAL_TENSOR_GROUP_SIZE[8] = {1, 1, 2, 3, 4, 5, 6, 7};
constexpr int MAX_BLOCK_COUNTS[8] = {256, 320, 320, 320, 320, 288, 288, 256};
constexpr int MAX_TENSOR_GROUP_COUNTS[8] = {1, 96, 64, 32, 32, 32, 32, 32};
constexpr int MAX_BLOCK_THREAD_COUNTS[8] = {256, 512, 512, 512, 512, 512, 512, 512};
// TensorGroupSize is the number of parallel tensors. For element-wise
// operators such as Relu, it should be 1. For two-operand operators such as
// element-wise addition, it should be 2. The value 0 is reserved for implementing
// kernels to handle a single large tensor.
template <int TensorGroupSize>
struct ChunkGroup {
// Number of chunks in this ChunkGroup.
// It's the effective size of block_index_to_tensor_group_index and
// block_index_to_chunk_start_index.
// The i-th chunk starts at the block_index_to_chunk_start_index[i]-th
// element in the block_index_to_tensor_group_index[i]-th tensor.
int chunk_count = 0;
// Max number of elements in each chunk in this ChunkGroup.
// It's an upper bound because chunks located at the end of a tensor
// are not always full. For example, if we split a 7-element vector into
// two 4-element chunks, the second chunk contains only 3 actual values.
int chunk_size = 0;
// The blockIdx.x-th block processes chunks in the block_index_to_tensor_group_index[blockIdx.x]-th
// tensor group. Each chunk starts from the block_index_to_chunk_start_index[blockIdx.x]-th
// element and extends until the end of this chunk or the end of the whole tensor.
//
// Let i = block_index_to_tensor_group_index[blockIdx.x]
//     n = tensor_sizes[i]
//     b = block_index_to_chunk_start_index[blockIdx.x]
//     e = min(b + chunk_size, n)
// The valid index range for blockIdx.x is defined by the following inequality:
//   b <= valid index < e
int block_index_to_tensor_group_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
int block_index_to_chunk_start_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
int tensor_sizes[MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
// The addresses of tensors where the chunks are extracted from.
// 1. tensor_ptrs[0][i], ..., tensor_ptrs[TensorGroupSize-1][i] are
// the tensors' pointers in the i-th group.
// 2. All tensors in the i-th group have the same size, tensor_sizes[i].
void* tensor_ptrs[ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]][MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
// Max number of GPU blocks to process the chunks in this chunk group.
const static int max_block_count = MAX_BLOCK_COUNTS[TensorGroupSize];
// Max number of tensor groups in this chunk group.
const static int max_tensor_group_count = MAX_TENSOR_GROUP_COUNTS[TensorGroupSize];
// The suggested number of threads to launch per GPU block.
const static int thread_count_per_block = MAX_BLOCK_THREAD_COUNTS[TensorGroupSize];
};
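// Worked example (illustrative values, not from this commit): with TensorGroupSize == 1,
// two tensors of sizes {10, 6} and chunk_size == 4, launch_multi_tensor_functor below
// fills a ChunkGroup<1> as
//   chunk_count = 5, chunk_size = 4,
//   block_index_to_tensor_group_index = {0, 0, 0, 1, 1},
//   block_index_to_chunk_start_index  = {0, 4, 8, 0, 4},
//   tensor_sizes = {10, 6},
// so block 2 covers elements [8, 10) of tensor 0, i.e. a partially filled chunk.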
template <int TensorGroupSize>
int compute_max_tensor_size_per_launch(int element_count_per_thread) {
constexpr int block_count =
ChunkGroup<TensorGroupSize>::max_block_count;
constexpr int thread_count_per_block =
ChunkGroup<TensorGroupSize>::thread_count_per_block;
return block_count * thread_count_per_block * element_count_per_thread;
}
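// For example, using the table values above: TensorGroupSize == 2 and
// element_count_per_thread == 4 give 320 * 512 * 4 == 655360 elements per launch.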
template <int TensorGroupSize, typename TMultiTensorFunctor, typename... TFunctorParams>
void launch_multi_tensor_functor(
hipStream_t stream,
const int chunk_size,
gsl::span<int> tensor_sizes,
gsl::span<std::vector<void*>> grouped_tensor_pointers,
TMultiTensorFunctor multipleTensorKernel,
TFunctorParams&&... kernelParams) {
// Check if 32-bit integer is enough.
ORT_ENFORCE(tensor_sizes.size() > 0);
ORT_ENFORCE(tensor_sizes.size() < static_cast<size_t>(INT_MAX));
ORT_ENFORCE(grouped_tensor_pointers.size() > 0);
ORT_ENFORCE(grouped_tensor_pointers.size() < static_cast<size_t>(INT_MAX));
ORT_ENFORCE(chunk_size > 0);
// Number of groups, for example, the number of updated weight tensors in Lamb optimizer.
const int group_count = static_cast<int>(grouped_tensor_pointers.size());
// Tensor count per group.
const int group_size = static_cast<int>(grouped_tensor_pointers[0].size());
int tensor_group_index = 0;
int block_index = 0;
ORT_ENFORCE(grouped_tensor_pointers.size() == tensor_sizes.size());
ORT_ENFORCE(group_size == ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]);
for (int i = 0; i < group_count; ++i) {
ORT_ENFORCE(grouped_tensor_pointers[i].size() == static_cast<size_t>(group_size));
}
// Handle multiple tensors per ROCM kernel call.
ChunkGroup<TensorGroupSize> chunk_group;
for (int i = 0; i < group_count; ++i) {
// Add pointers to one group of tensors into chunk_group.
for (int j = 0; j < group_size; ++j) {
chunk_group.tensor_ptrs[j][tensor_group_index] = grouped_tensor_pointers[i][j];
}
// Assuming that all tensors in this group have the same shape, we record that common size once.
chunk_group.tensor_sizes[tensor_group_index] = tensor_sizes[i];
chunk_group.chunk_size = chunk_size;
const int chunk_count = (tensor_sizes[i] + chunk_size - 1) / chunk_size;
// Process all chunks in this tensor group.
for (int chunk_index = 0; chunk_index < chunk_count; ++chunk_index) {
chunk_group.block_index_to_tensor_group_index[block_index] = tensor_group_index;
chunk_group.block_index_to_chunk_start_index[block_index] = chunk_index * chunk_size;
// After ++block_index, block_index becomes the count of chunks in chunk_group.
++block_index;
chunk_group.chunk_count = block_index;
if (block_index == chunk_group.max_block_count) {
multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
block_index = 0;
}
}
// After ++tensor_group_index, tensor_group_index becomes the count of tensor groups in chunk_group.
++tensor_group_index;
if (tensor_group_index == chunk_group.max_tensor_group_count) {
multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
block_index = 0;
tensor_group_index = 0;
}
}
// This round of tensor group processing is finished.
// Any groups remaining in chunk_group must be processed now.
if (block_index != 0) {
multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
block_index = 0;
tensor_group_index = 0;
}
}
} // namespace rocm
} // namespace onnxruntime
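A minimal caller sketch for launch_multi_tensor_functor follows; it is not part of this commit, the functor and buffer names are hypothetical, and it assumes the declarations from the header above (TensorGroupSize == 2, so two pointers per tensor group):
// Hypothetical functor: receives the stream, a populated ChunkGroup<2>, and any
// extra arguments forwarded by launch_multi_tensor_functor.
struct ScaleAndCopyFunctor {
  void operator()(hipStream_t stream, onnxruntime::rocm::ChunkGroup<2> chunk_group, float alpha) {
    // A real implementation would launch a __global__ kernel with up to
    // chunk_group.max_block_count blocks of chunk_group.thread_count_per_block threads.
  }
};
void ExampleCall(hipStream_t stream, void* src0, void* dst0, void* src1, void* dst1) {
  std::vector<int> tensor_sizes = {1000000, 250000};
  std::vector<std::vector<void*>> grouped_ptrs = {{src0, dst0}, {src1, dst1}};
  onnxruntime::rocm::launch_multi_tensor_functor<2>(
      stream, /*chunk_size=*/2048, gsl::make_span(tensor_sizes),
      gsl::make_span(grouped_ptrs), ScaleAndCopyFunctor{}, /*alpha=*/0.5f);
}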
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "batch_norm.h"
#include "core/providers/common.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace std;
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
7, 8, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
9, 13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
14, 14, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("U", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
15, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>);
template <typename T>
Status BatchNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* B = p_op_kernel_context->Input<Tensor>(2);
const Tensor* mean = p_op_kernel_context->Input<Tensor>(3);
const Tensor* var = p_op_kernel_context->Input<Tensor>(4);
ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1));
const TensorShape& x_shape = X->Shape();
const TensorShape& channel_shape = mean->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
Tensor* running_mean = p_op_kernel_context->Output(1, channel_shape);
Tensor* running_var = p_op_kernel_context->Output(2, channel_shape);
Tensor* saved_mean = p_op_kernel_context->Output(3, channel_shape);
Tensor* saved_var = p_op_kernel_context->Output(4, channel_shape);
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto scale_data = reinterpret_cast<const HipT*>(scale->Data<T>());
auto b_data = reinterpret_cast<const HipT*>(B->Data<T>());
auto mean_data = reinterpret_cast<const HipT*>(mean->Data<T>());
auto var_data = reinterpret_cast<const HipT*>(var->Data<T>());
auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
const auto alpha = Consts<HipT>::One;
const auto beta = Consts<HipT>::Zero;
MiopenTensor data_desc;
vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));
// For half data type, the alpha, beta, scale, B, mean, var need to be float type
if (X->IsDataType<MLFloat16>()) {
MiopenTensor scale_desc;
ORT_RETURN_IF_ERROR(scale_desc.Set(new_dims, MiopenTensor::GetDataType<float>()));
MiopenTensor bn_tensor_desc;
ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, miopen_batch_norm_mode_));
// Convert the scale, B, mean, var to float
const int64_t C = x_shape.GetDims()[1];
auto f_scale = GetScratchBuffer<float>(C);
auto f_B = GetScratchBuffer<float>(C);
auto f_mean = GetScratchBuffer<float>(C);
auto f_var = GetScratchBuffer<float>(C);
Impl_Cast<HipT, float>(Stream(), scale_data, f_scale.get(), C);
Impl_Cast<HipT, float>(Stream(), b_data, f_B.get(), C);
Impl_Cast<HipT, float>(Stream(), mean_data, f_mean.get(), C);
Impl_Cast<HipT, float>(Stream(), var_data, f_var.get(), C);
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardInferenceHelper(
MiopenHandle(),
miopen_batch_norm_mode_,
&alpha,
&beta,
data_desc,
x_data,
data_desc,
y_data,
bn_tensor_desc,
f_scale.get(),
f_B.get(),
f_mean.get(),
f_var.get(),
epsilon_));
return Status::OK();
}
MiopenTensor bn_tensor_desc;
ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, miopen_batch_norm_mode_));
// in BatchNorm Forward Training mode if all 5 outputs present
if (running_mean && running_var && saved_mean && saved_var) {
auto running_mean_data = reinterpret_cast<HipT*>(running_mean->MutableData<T>());
auto running_var_data = reinterpret_cast<HipT*>(running_var->MutableData<T>());
auto saved_mean_data = reinterpret_cast<HipT*>(saved_mean->MutableData<T>());
auto saved_inv_var_data = reinterpret_cast<HipT*>(saved_var->MutableData<T>());
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopen_batch_norm_mode_,
&alpha,
&beta,
data_desc,
x_data,
data_desc,
y_data,
bn_tensor_desc,
scale_data,
b_data,
momentum_,
running_mean_data,
running_var_data,
epsilon_,
saved_mean_data,
saved_inv_var_data));
// in BatchNorm Forward Inference mode if only Y output present
} else {
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardInferenceHelper(
MiopenHandle(),
miopen_batch_norm_mode_,
&alpha,
&beta,
data_desc,
x_data,
data_desc,
y_data,
bn_tensor_desc,
scale_data,
b_data,
mean_data,
var_data,
epsilon_));
}
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status BatchNorm<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
SPECIALIZED_COMPUTE(MLFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class BatchNorm final : public RocmKernel {
public:
BatchNorm(const OpKernelInfo& op_kernel_info)
: RocmKernel{op_kernel_info},
miopen_batch_norm_mode_(miopenBNSpatial),
momentum_(0.9) {
float tmp_epsilon;
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
epsilon_ = ClampMiopenBatchNormEpsilon(static_cast<double>(tmp_epsilon));
// spatial or not
int64_t tmp_spatial;
if (op_kernel_info.GetAttr<int64_t>("spatial", &tmp_spatial).IsOK()) {
spatial_ = tmp_spatial;
}
if (spatial_ == 0) {
miopen_batch_norm_mode_ = miopenBNPerActivation;
}
float tmp_momentum;
if (op_kernel_info.GetAttr<float>("momentum", &tmp_momentum).IsOK()) {
momentum_ = static_cast<double>(tmp_momentum);
}
is_training_mode_ = (op_kernel_info.GetAttrOrDefault<int64_t>("training_mode", 0) == 1);
const auto& node = op_kernel_info.node();
auto opset = node.SinceVersion();
// batch norm opset 14 (or higher) is not implemented for training mode
ORT_ENFORCE(!(is_training_mode_ && opset >= 14), "Training mode does not support BN opset 14 (or higher) yet.");
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
double epsilon_;
int64_t spatial_ = 1; // default as per spec
miopenBatchNormMode_t miopen_batch_norm_mode_;
double momentum_;
bool is_training_mode_ = false;  // default as per spec
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/nn/dropout.h"
#include "core/providers/rocm/nn/dropout_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
namespace {
template <typename T>
struct GetRatioDataImpl {
void operator()(const Tensor* ratio, float& ratio_data) const {
ratio_data = static_cast<float>(*(ratio->Data<T>()));
ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f, "ratio_data is outside range [0, 1)");
}
};
template <typename T>
struct DropoutComputeImpl {
void operator()(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N, const int64_t mask_element_count,
const float ratio_data, PhiloxGenerator& generator, const Tensor& X, Tensor& Y, void* mask_data,
bool use_bitmask) const {
typedef typename ToHipType<T>::MappedType HipT;
const HipT* X_data = reinterpret_cast<const HipT*>(X.Data<T>());
HipT* Y_data = reinterpret_cast<HipT*>(Y.MutableData<T>());
DropoutKernelImpl<HipT>(prop, stream, N, mask_element_count, ratio_data, generator, X_data, Y_data, mask_data,
use_bitmask);
}
};
} // namespace
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Dropout, kOnnxDomain, 12, 12, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes())
.TypeConstraint("T1", DataTypeImpl::AllIEEEFloatTensorTypes())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.InputMemoryType(OrtMemTypeCPUInput, 2),
Dropout<false>);
ONNX_OPERATOR_KERNEL_EX(Dropout, kOnnxDomain, 13, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
.TypeConstraint("T1", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.InputMemoryType(OrtMemTypeCPUInput, 2),
Dropout<false>);
template <bool UseBitmask>
Status Dropout<UseBitmask>::ComputeInternal(OpKernelContext* context) const {
// Get X_data
const Tensor* X = context->Input<Tensor>(0);
if (!X) return Status(common::ONNXRUNTIME, common::FAIL, "X Input is not available.");
const TensorShape& shape = X->Shape();
const int64_t N = shape.Size();
// Get Y_data
auto Y = context->Output(0, shape);
// Get mask_data
Tensor* mask = nullptr;
int64_t mask_element_count = N;
if (UseBitmask) {
mask_element_count = (N + kNumBitsPerBitmaskElement - 1) / kNumBitsPerBitmaskElement;
mask = context->Output(1, {mask_element_count});
} else {
mask = context->Output(1, shape);
}
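// For example, assuming kNumBitsPerBitmaskElement == 32: an input with N == 1000 elements
// needs mask_element_count == (1000 + 31) / 32 == 32 bitmask words in the bitmask path,
// versus 1000 bool elements otherwise.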
ORT_ENFORCE(!mask || mask->Shape().Size() == mask_element_count);
// Get the ratio_data
float ratio_data = default_ratio_;
auto ratio = context->Input<Tensor>(1);
if (ratio) {
utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(ratio->GetElementType());
t_disp.Invoke<GetRatioDataImpl>(ratio, ratio_data);
}
const Tensor* training_mode = context->Input<Tensor>(2);
// Check for inference mode.
if (ratio_data == 0.f || !training_mode || !(*(training_mode->Data<bool>()))) {
const void* X_data = X->DataRaw();
void* Y_data = Y->MutableDataRaw();
if (Y_data != X_data) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y_data, X_data, X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
}
// If mask is requested, return all 1s.
if (mask) {
if (UseBitmask) {
HIP_RETURN_IF_ERROR(
hipMemsetAsync(mask->MutableDataRaw(), -1, mask_element_count * sizeof(BitmaskElementType), Stream()));
} else {
HIP_RETURN_IF_ERROR(
hipMemsetAsync(mask->MutableData<bool>(), true, mask_element_count * sizeof(bool), Stream()));
}
}
return Status::OK();
}
IAllocatorUniquePtr<void> temp_mask_buffer{}; // buffer to use if mask is not provided
void* const mask_data = [this, mask_element_count, mask, &temp_mask_buffer]() {
if (mask) return mask->MutableDataRaw();
temp_mask_buffer =
GetScratchBuffer<void>(mask_element_count * (UseBitmask ? sizeof(BitmaskElementType) : sizeof(bool)));
return temp_mask_buffer.get();
}();
PhiloxGenerator& generator = generator_ ? *generator_ : PhiloxGenerator::Default();
utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(X->GetElementType());
t_disp.Invoke<DropoutComputeImpl>(GetDeviceProp(), Stream(), N, mask_element_count, ratio_data, generator, *X, *Y,
mask_data, UseBitmask);
return Status::OK();
}
// Instantiation for Dropout.
template class Dropout<false>;
template class Dropout<true>;
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/common.h"
#include "core/framework/random_generator.h"
namespace onnxruntime {
namespace rocm {
template <bool UseBitmask>
class Dropout final : public RocmKernel {
public:
Dropout(const OpKernelInfo& info) : RocmKernel(info) {
int64_t seed = 0;
if (info.GetAttr<int64_t>("seed", &seed).IsOK()) {
generator_ = std::make_unique<PhiloxGenerator>(static_cast<uint64_t>(seed));
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
mutable std::unique_ptr<PhiloxGenerator> generator_;
static constexpr float default_ratio_ = 0.5f;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
#include "core/providers/rocm/nn/dropout_impl.h"
#include <hiprand_kernel.h>
#include <algorithm>
#include "core/providers/rocm/cu_inc/bitmask.cuh"
namespace onnxruntime {
namespace rocm {
constexpr int kBlockSize = 256;
constexpr int kNumUnroll = 4;
template <typename T, bool UseBitmask>
__global__ void DropoutKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
const int steps_per_thread, const fast_divmod fdm_bits_per_element, const float ratio,
const std::pair<uint64_t, uint64_t> seeds, const T* X_data, T* Y_data, void* mask_data) {
HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
const float p = 1.0f - ratio;
const float scale = 1.0f / p;
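// For example, ratio == 0.25 gives p == 0.75: about 75% of the elements are kept and
// each survivor is scaled by 1 / 0.75, so the expected sum of the output matches the input.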
hiprandStatePhilox4_32_10_t state;
hiprand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// We ensure every thread generates the same number of random numbers (by rounding
// up the size) and at the same timestep (by syncing threads).
// From the ROCm hiprand documentation:
// The Philox_4x32_10 algorithm is closely tied to the thread and block count.
// Each thread computes 4 random numbers at the same time, thus the most efficient
// use of Philox_4x32_10 is to generate a multiple of 4 times the number of threads.
for (int i = 0; i < steps_per_thread; ++i) {
HIP_LONG id = idx * kNumUnroll + i * step_size;
rand = hiprand_uniform4(&state);
BitmaskElementType thread_bitmask = 0;
// actual computation
#pragma unroll
for (int ii = 0; ii < kNumUnroll; ++ii) {
HIP_LONG li = id + ii;
if (li < N) {
bool mask = (&rand.x)[ii] < p;
Y_data[li] = static_cast<T>(static_cast<float>(X_data[li]) * mask * scale);
if (UseBitmask) {
thread_bitmask |= (mask << ii);
} else {
reinterpret_cast<bool*>(mask_data)[li] = mask;
}
}
}
if (UseBitmask) {
SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
reinterpret_cast<BitmaskElementType*>(mask_data));
}
__syncthreads();
}
}
template <typename T, bool UseBitmask>
__global__ void DropoutVectorizedKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
const int steps_per_thread, const fast_divmod fdm_bits_per_element,
const float ratio, const std::pair<uint64_t, uint64_t> seeds, const T* X_data,
T* Y_data, void* mask_data) {
HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
const float p = 1.0f - ratio;
const float scale = 1.0f / p;
hiprandStatePhilox4_32_10_t state;
hiprand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// Use the vectorized data load/store approach when N % 4 == 0, since this is the
// typical case for input shape sizes.
using LoadT = aligned_vector<T, kNumUnroll>;
using MaskLoadT = aligned_vector<bool, kNumUnroll>;
for (int i = 0; i < steps_per_thread; ++i) {
HIP_LONG id = idx * kNumUnroll + i * step_size;
rand = hiprand_uniform4(&state);
BitmaskElementType thread_bitmask = 0;
if (id < N) {
// vectorized load into storage
T src[kNumUnroll];
LoadT* value = reinterpret_cast<LoadT*>(&src);
*value = *reinterpret_cast<const LoadT*>(&X_data[id]);
T r[kNumUnroll];
bool masks[kNumUnroll];
// actual computation
#pragma unroll
for (int ii = 0; ii < kNumUnroll; ++ii) {
bool mask = (&rand.x)[ii] < p;
r[ii] = static_cast<T>(static_cast<float>(src[ii]) * mask * scale);
if (UseBitmask) {
thread_bitmask |= (mask << ii);
} else {
masks[ii] = mask;
}
}
// Vectorized writes for mask_data & Y_data
*(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
if (!UseBitmask) {
*(reinterpret_cast<MaskLoadT*>(&reinterpret_cast<bool*>(mask_data)[id])) =
*reinterpret_cast<MaskLoadT*>(&masks[0]);
}
}
if (UseBitmask) {
SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
reinterpret_cast<BitmaskElementType*>(mask_data));
}
__syncthreads();
}
}
#define LAUNCH_DROPOUT_KERNEL(FuncName, UseBitmask) \
hipLaunchKernelGGL(HIP_KERNEL_NAME(FuncName<T, UseBitmask>), grid_size, kBlockSize, 0, stream, \
static_cast<HIP_LONG>(N), static_cast<HIP_LONG>(mask_element_count), step_size, steps_per_thread, \
fdm_bits_per_element, ratio, seeds, X_data, Y_data, mask_data)
#define HANDLE_DROPOUT_USE_BITMASK(FuncName) \
if (use_bitmask) { \
LAUNCH_DROPOUT_KERNEL(FuncName, true); \
} else { \
LAUNCH_DROPOUT_KERNEL(FuncName, false); \
}
template <typename T>
void DropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, const T* X_data,
T* Y_data, void* mask_data, bool use_bitmask) {
const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / kBlockSize;
const int grid_size =
std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, kBlockSize * kNumUnroll)));
// Compute the number of random numbers generated by each thread, and increment philox generator offset by that
// amount.
const int step_size = kBlockSize * grid_size * kNumUnroll;
const int steps_per_thread = static_cast<int>(CeilDiv(N, step_size));
auto seeds = generator.NextPhiloxSeeds(static_cast<uint64_t>(steps_per_thread * kNumUnroll));
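// Worked example (illustrative numbers): for N == 100000 with kBlockSize == 256 and
// kNumUnroll == 4, CeilDiv(N, 1024) == 98 blocks are requested (capped by the occupancy
// limit above), step_size == 256 * grid_size * 4, and each thread consumes
// steps_per_thread * 4 Philox outputs, which is the offset increment passed to NextPhiloxSeeds.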
fast_divmod fdm_bits_per_element(kNumBitsPerBitmaskElement);
if (N % kNumUnroll != 0) {
HANDLE_DROPOUT_USE_BITMASK(DropoutKernel);
} else {
HANDLE_DROPOUT_USE_BITMASK(DropoutVectorizedKernel);
}
}
#undef HANDLE_DROPOUT_USE_BITMASK
#undef LAUNCH_DROPOUT_KERNEL
#define SPECIALIZED_DROPOUT_IMPL(T) \
template void DropoutKernelImpl<T>(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N, \
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, \
const T* X_data, T* Y_data, void* mask_data, bool use_bitmask);
SPECIALIZED_DROPOUT_IMPL(float)
SPECIALIZED_DROPOUT_IMPL(double)
SPECIALIZED_DROPOUT_IMPL(half)
SPECIALIZED_DROPOUT_IMPL(BFloat16)
#undef SPECIALIZED_DROPOUT_IMPL
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/framework/random_generator.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void DropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, const T* X_data,
T* Y_data, void* mask_data, bool use_bitmask);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "instance_norm.h"
#include "instance_norm_impl.h"
#include "core/providers/cpu/nn/instance_norm_helper.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
InstanceNormalization, \
kOnnxDomain, \
6, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
InstanceNorm<T>);
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
template <typename T>
InstanceNorm<T>::InstanceNorm(const OpKernelInfo& op_kernel_info)
: RocmKernel(op_kernel_info) {
float tmp_epsilon;
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
epsilon_ = ClampMiopenBatchNormEpsilon(tmp_epsilon);
}
template <typename T>
Status InstanceNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));
const TensorShape& x_shape = X->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
const auto* x_data = reinterpret_cast<const HipT*>(X->Data<T>());
const auto* scale_data = reinterpret_cast<const HipT*>(scale->Data<T>());
const auto* bias_data = reinterpret_cast<const HipT*>(bias->Data<T>());
const auto& x_dims = x_shape.GetDims();
const int64_t N = x_dims[0];
const int64_t C = x_dims[1];
const auto one = Consts<HipT>::One;
const auto zero = Consts<HipT>::Zero;
if (N == 1) {
// when N == 1, we can treat it as spatial batch normalization in training
// as the mean/variance would be computed from input
MiopenTensor data_desc;
std::vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, miopenBNSpatial));
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data,
stats_desc,
scale_data,
bias_data,
1.0f,
nullptr,
nullptr,
epsilon_,
nullptr,
nullptr));
} else {
// We use miopenBatchNormalizationForwardTraining to compute the mean/variance,
// so we collapse N and C into the channel dimension.
auto input_count = x_shape.Size(); // N * C * H * W
auto stats_count = x_shape.SizeToDimension(2); // N * C
auto image_size = input_count / stats_count;
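// For example, for an input of shape (2, 3, 4, 5): input_count == 120,
// stats_count == 2 * 3 == 6 and image_size == 20, so below the data is described
// as a (1, 6, 20, 1) tensor and the per-instance, per-channel statistics as (1, 6, 1, 1).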
MiopenTensor data_desc;
ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1}, MiopenTensor::GetDataType<HipT>()));
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1}, MiopenTensor::GetDataType<HipT>()));
const size_t stats_byte_count = stats_count * sizeof(HipT);
// Mean & Variance are inputs & outputs and must be initialized to zero to work properly
auto mean = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
auto variance = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));
// We must set the scale & bias inputs to zero as they are inputs to the calculation
auto unused_scale = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
auto unused_bias = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));
// first, compute mean and variance per-instance per-channel using miopenBatchNorm training
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data, // use y temporarily, would be rewritten later
stats_desc,
unused_scale.get(),
unused_bias.get(),
1.0f,
mean.get(),
variance.get(),
MIOPEN_BN_MIN_EPSILON,
nullptr,
nullptr));
// Y = scale * (x - mean) / sqrt (variance + epsilon) + B
// X/Y is (N,C,H,W)
// scale/bias is (1,C,1,1)
// mean/stddev is (N,C,1,1)
// NOTE miopenBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
// and it needs to be corrected with (count - 1) / count
fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
fast_divmod fdm_C(gsl::narrow_cast<int>(C));
InstanceNormImpl<HipT>(
Stream(),
x_data,
scale_data,
bias_data,
mean.get(),
variance.get(),
(image_size - 1.0) / image_size,
static_cast<double>(epsilon_),
fdm_HW,
fdm_C,
y_data,
input_count);
}
return Status::OK();
}
template <>
Status InstanceNorm<MLFloat16>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<MLFloat16>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));
const TensorShape& x_shape = X->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<MLFloat16>());
const auto* x_data = reinterpret_cast<const HipT*>(X->Data<MLFloat16>());
const auto* scale_data = reinterpret_cast<const HipT*>(scale->Data<MLFloat16>());
const auto* bias_data = reinterpret_cast<const HipT*>(bias->Data<MLFloat16>());
const auto& x_dims = x_shape.GetDims();
const int64_t N = x_dims[0];
const int64_t C = x_dims[1];
const auto one = Consts<HipT>::One;
const auto zero = Consts<HipT>::Zero;
if (N == 1) {
// when N == 1, we can treat it as spatial batch normalization in training
// as the mean/variance would be computed from input
MiopenTensor data_desc;
std::vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, miopenBNSpatial));
// For half input data type, alpha, beta, scale, and bias need to be of float type.
// alpha and beta are already float because the Consts struct specialization
// for MLFloat16 takes care of that, so only the scale and bias are converted to float.
auto scale_data_fp32 = GetScratchBuffer<float>(C);
Impl_Cast<HipT, float>(Stream(), scale_data, scale_data_fp32.get(), C);
auto bias_data_fp32 = GetScratchBuffer<float>(C);
Impl_Cast<HipT, float>(Stream(), bias_data, bias_data_fp32.get(), C);
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data,
stats_desc,
scale_data_fp32.get(),
bias_data_fp32.get(),
1.0f,
nullptr,
nullptr,
epsilon_,
nullptr,
nullptr));
} else {
// We use miopenBatchNormalizationForwardTraining to compute the mean/variance,
// so we collapse N and C into the channel dimension.
auto input_count = x_shape.Size(); // N * C * H * W
auto stats_count = x_shape.SizeToDimension(2); // N * C
auto image_size = input_count / stats_count;
MiopenTensor data_desc;
ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1},
MiopenTensor::GetDataType<HipT>()));
// stats_desc needs to be of 'float' type even for float16 input as the "stats" are of float type
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1},
MiopenTensor::GetDataType<float>()));
// For half input data type, we need to allocate some "intermediate"
// float buffers for MIOpen to use.
const size_t stats_byte_count = stats_count * sizeof(float);
// Mean & Variance are inputs & outputs and must be initialized to zero to work properly
auto mean = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
auto variance = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));
// We must set the scale & bias inputs to zero as they are inputs to the calculation
auto unused_scale = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
auto unused_bias = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));
// first, compute mean and variance per-instance per-channel using miopenBatchNorm training
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data, // use y temporarily, would be rewritten later
stats_desc,
unused_scale.get(),
unused_bias.get(),
1.0f,
mean.get(),
variance.get(),
MIOPEN_BN_MIN_EPSILON,
nullptr,
nullptr));
// Y = scale * (x - mean) / sqrt (variance + epsilon) + B
// X/Y is (N,C,H,W)
// scale/bias is (1,C,1,1)
// mean/stddev is (N,C,1,1)
// NOTE miopenBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
// and it needs to be corrected with (count - 1) / count
fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
fast_divmod fdm_C(gsl::narrow_cast<int>(C));
// The InstanceNormImpl kernel handles the mean/variance in float32, so no casting required here
InstanceNormImpl<HipT, float>(
Stream(),
x_data,
scale_data,
bias_data,
mean.get(),
variance.get(),
(image_size - 1.0) / image_size,
static_cast<double>(epsilon_),
fdm_HW,
fdm_C,
y_data,
input_count);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class InstanceNorm final : public RocmKernel {
public:
InstanceNorm(const OpKernelInfo& op_kernel_info);
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
private:
double epsilon_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "instance_norm_impl.h"
namespace onnxruntime {
namespace rocm {
template <typename T1, typename T2>
__global__ void _InstanceNormKernel(
const T1* __restrict__ input_data,
const T1* __restrict__ scale,
const T1* __restrict__ bias,
const T2* __restrict__ mean,
const T2* __restrict__ variance,
const double variance_correction,
const double epsilon,
const fast_divmod fdm_HW,
const fast_divmod fdm_C,
T1* __restrict__ output_data,
const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int nc = fdm_HW.div(id);
int n, c;
fdm_C.divmod(nc, n, c);
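// For example, for an NCHW input of shape (2, 3, 4, 5): HW == 20 and C == 3, so
// id == 67 gives nc == 67 / 20 == 3 and then n == 1, c == 0.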
// Y = scale * (x - mean) / sqrt (std * std + epsilon) + B
output_data[id] = scale[c] * (input_data[id] - (T1)mean[nc]) / _Sqrt((T1)variance[nc] * (T1)variance_correction + (T1)epsilon) + bias[c];
}
template <typename T1, typename T2>
void InstanceNormImpl(
hipStream_t stream,
const T1* input_data,
const T1* scale,
const T1* bias,
const T2* mean,
const T2* variance,
const double variance_correction,
const double epsilon,
const fast_divmod& fdm_HW,
const fast_divmod& fdm_C,
T1* output_data,
size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_InstanceNormKernel<T1, T2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C, output_data, (HIP_LONG)N);
}
#define SPECIALIZED_IMPL(T1, T2) \
template void InstanceNormImpl<T1, T2>(hipStream_t stream, const T1* input_data, const T1* scale, const T1* bias, const T2* mean, const T2* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T1* output_data, size_t count);
SPECIALIZED_IMPL(float, float)
SPECIALIZED_IMPL(double, double)
// When the input data type is float16, the means and variances will flow in as float32 (special case)
SPECIALIZED_IMPL(half, float)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
template <typename T1, typename T2>
void InstanceNormImpl(
hipStream_t stream,
const T1* input_data,
const T1* scale,
const T1* bias,
const T2* mean,
const T2* variance,
const double variance_correction,
const double epsilon,
const fast_divmod& fdm_HW,
const fast_divmod& fdm_C,
T1* output_data,
size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/layer_norm.h"
#include "core/providers/rocm/nn/layer_norm_impl.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T, U) \
ONNX_OPERATOR_TYPED_KERNEL_EX(LayerNormalization, kOnnxDomain, 17, T, kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("U", DataTypeImpl::GetTensorType<U>()), \
LayerNorm<T, U, T, false>);
REGISTER_KERNEL_TYPED(float, float)
REGISTER_KERNEL_TYPED(double, float)
REGISTER_KERNEL_TYPED(MLFloat16, float)
REGISTER_KERNEL_TYPED(BFloat16, float)
template <typename T, typename U, typename V, bool simplified>
LayerNorm<T, U, V, simplified>::LayerNorm(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {
ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK());
float tmp_epsilon;
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
epsilon_ = tmp_epsilon;
}
template <typename T, typename U, typename V, bool simplified>
Status LayerNorm<T, U, V, simplified>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<T>::MappedType HipT;
typedef typename ToHipType<U>::MappedType CudaU;
typedef typename ToHipType<V>::MappedType CudaV;
// Inputs
const Tensor* X = ctx->Input<Tensor>(0);
const Tensor* scale = ctx->Input<Tensor>(1);
const Tensor* bias = ctx->Input<Tensor>(2);
auto X_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto scale_data = reinterpret_cast<const CudaV*>(scale->Data<V>());
auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaV*>(bias->Data<V>());
const TensorShape& x_shape = X->Shape();
const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
int n2 = gsl::narrow<int>(x_shape.SizeFromDimension(axis));
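// For example, for X of shape (2, 128, 768) with axis == -1 (normalized to 2),
// n1 == 2 * 128 == 256 rows are each normalized over n2 == 768 elements.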
const auto scale_size = scale->Shape().Size();
const auto bias_size = (bias_data) ? bias->Shape().Size() : 0;
if (n2 == 1 || scale_size != n2 || (bias_data && bias_size != n2)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Size of X.shape()[axis:] == ", n2,
". Size of scale and bias (if provided) must match this "
"and the size must not be 1. Got scale size of ",
scale_size, " and bias size of ", bias_size);
}
// Outputs
Tensor* Y = ctx->Output(0, x_shape);
auto Y_data = reinterpret_cast<CudaV*>(Y->MutableData<V>());
// Mean and variance
std::vector<int64_t> mean_inv_std_var_dim;
for (int i = 0; i < static_cast<int>(x_shape.NumDimensions()); ++i) {
if (i < axis) {
mean_inv_std_var_dim.emplace_back(x_shape.GetDims()[i]);
} else {
mean_inv_std_var_dim.emplace_back(1);
}
}
int output_index = 1;
CudaU* mean_data = nullptr;
if (!simplified) {
Tensor* mean = ctx->Output(output_index++, TensorShape(mean_inv_std_var_dim));
if (mean != nullptr) {
mean_data = reinterpret_cast<CudaU*>(mean->MutableData<U>());
}
}
CudaU* inv_var_data = nullptr;
Tensor* var = ctx->Output(output_index, TensorShape(mean_inv_std_var_dim));
if (var != nullptr) {
inv_var_data = reinterpret_cast<CudaU*>(var->MutableData<U>());
}
if (x_shape.Size() == 0) {
return Status::OK();
}
HostApplyLayerNorm<HipT, CudaU, CudaV, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data,
X_data, n1, n2, epsilon_, scale_data, bias_data);
return Status::OK();
}
#if !defined(DISABLE_CONTRIB_OPS)
#define LAYERNORM_IMPL(T, U, V, simplified) \
template class LayerNorm<T, U, V, simplified>;
// contrib op usage
LAYERNORM_IMPL(float, float, float, false)
LAYERNORM_IMPL(double, double, double, false)
LAYERNORM_IMPL(MLFloat16, float, MLFloat16, false)
LAYERNORM_IMPL(float, float, MLFloat16, false)
LAYERNORM_IMPL(MLFloat16, float, float, false)
LAYERNORM_IMPL(BFloat16, float, BFloat16, false)
LAYERNORM_IMPL(float, float, float, true)
LAYERNORM_IMPL(double, double, double, true)
LAYERNORM_IMPL(MLFloat16, float, MLFloat16, true)
LAYERNORM_IMPL(float, float, MLFloat16, true)
LAYERNORM_IMPL(MLFloat16, float, float, true)
LAYERNORM_IMPL(BFloat16, float, BFloat16, true)
#endif
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
using namespace onnxruntime::rocm;
// NOTE: This was originally a contrib op with 3 type constraints. The ONNX spec merges 'T' and 'V'.
// The kernel is templatized on all three for backwards compatibility, but in ONNX usage T == V.
template <typename T, typename U, typename V, bool simplified>
class LayerNorm final : public RocmKernel {
public:
LayerNorm(const OpKernelInfo& op_kernel_info);
Status ComputeInternal(OpKernelContext* ctx) const override;
private:
int64_t axis_;
double epsilon_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#include "core/providers/rocm/cu_inc/common.cuh"
#include "layer_norm_impl.h"
namespace onnxruntime {
namespace rocm {
using namespace onnxruntime::rocm;
template <typename U, bool simplified>
__device__ void cuWelfordOnlineSum(
const U curr,
U& mu,
U& sigma2,
U& count) {
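// Welford's online update: with the new count k, mu_k = mu_{k-1} + (x_k - mu_{k-1}) / k, and in the
// non-simplified case sigma2 accumulates M2_k = M2_{k-1} + (x_k - mu_{k-1}) * (x_k - mu_k);
// dividing by n2 at the end of cuWelfordMuSigma2 yields the (biased) variance.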
count = count + U(1);
U delta = curr - mu;
U lmean = mu + delta / count;
mu = lmean;
if (simplified) {
sigma2 = sigma2 + curr * curr;
} else {
U delta2 = curr - lmean;
sigma2 = sigma2 + delta * delta2;
}
}
template <typename U, bool simplified>
__device__ void cuChanOnlineSum(
const U muB,
const U sigma2B,
const U countB,
U& mu,
U& sigma2,
U& count) {
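// Chan et al.'s parallel merge of two partial (mean, M2, count) results: the merged mean is the
// count-weighted average, and in the non-simplified case the merged M2 adds
// delta^2 * nA * nB / (nA + nB), which the nA / nX and nB / nX scaling below computes.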
U delta = muB - mu;
U nA = count;
U nB = countB;
count = count + countB;
U nX = count;
if (nX > U(0)) {
nA = nA / nX;
nB = nB / nX;
mu = nA * mu + nB * muB;
if (simplified) {
sigma2 = sigma2 + sigma2B;
} else {
sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
}
} else {
mu = U(0);
sigma2 = U(0);
}
}
template <typename T, typename U, bool simplified>
__device__ void cuWelfordMuSigma2(
const T* __restrict__ vals,
const int n1,
const int n2,
const int i1,
U& mu,
U& sigma2,
U* buf) {
// Assumptions:
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
// compute variance and mean over n2
U count = U(0);
mu = U(0);
sigma2 = U(0);
if (i1 < n1) {
// one warp normalizes one n1 index,
// synchronization is implicit
// initialize with standard Welford algorithm
const int numx = blockDim.x * blockDim.y;
const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
const T* lvals = vals + i1 * n2;
int l = 4 * thrx;
for (; l + 3 < n2; l += 4 * numx) {
for (int k = 0; k < 4; ++k) {
U curr = static_cast<U>(lvals[l + k]);
cuWelfordOnlineSum<U, simplified>(curr, mu, sigma2, count);
}
}
for (; l < n2; ++l) {
U curr = static_cast<U>(lvals[l]);
cuWelfordOnlineSum<U, simplified>(curr, mu, sigma2, count);
}
// intra-warp reductions
#pragma unroll
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
U muB = WARP_SHFL_DOWN(mu, stride);
U countB = WARP_SHFL_DOWN(count, stride);
U sigma2B = WARP_SHFL_DOWN(sigma2, stride);
cuChanOnlineSum<U, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
if (blockDim.y > 1) {
U* ubuf = (U*)buf;
U* ibuf = (U*)(ubuf + blockDim.y);
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
// upper half of warps write to shared
if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
const int wrt_y = threadIdx.y - offset;
ubuf[2 * wrt_y] = mu;
ubuf[2 * wrt_y + 1] = sigma2;
ibuf[wrt_y] = count;
}
__syncthreads();
// lower half merges
if (threadIdx.x == 0 && threadIdx.y < offset) {
U muB = ubuf[2 * threadIdx.y];
U sigma2B = ubuf[2 * threadIdx.y + 1];
U countB = ibuf[threadIdx.y];
cuChanOnlineSum<U, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
// threadIdx.x == 0 && threadIdx.y == 0 is the only thread that has the correct values
if (threadIdx.x == 0 && threadIdx.y == 0) {
ubuf[0] = mu;
ubuf[1] = sigma2;
}
__syncthreads();
mu = ubuf[0];
sigma2 = ubuf[1] / U(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
}
}
}
template <bool simplified>
__device__ void cuWelfordMuSigma2(
const half* __restrict__ vals,
const int n1,
const int n2,
const int i1,
float& mu,
float& sigma2,
float* buf) {
// Assumptions:
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
// compute variance and mean over n2
float count = 0.0f;
mu = float(0);
sigma2 = float(0);
if (i1 < n1) {
// one warp normalizes one n1 index,
// synchronization is implicit
// initialize with standard Welford algorithm
const int numx = blockDim.x * blockDim.y;
const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
const half* lvals = vals + i1 * n2;
int l = 8 * thrx;
if ((((size_t)lvals) & 3) != 0) {
// 16 bit alignment
// first thread consumes first point
if (thrx == 0) {
float curr = static_cast<float>(lvals[0]);
cuWelfordOnlineSum<float, simplified>(curr, mu, sigma2, count);
}
++l;
}
// at this point, lvals[l] are 32 bit aligned for all threads.
for (; l + 7 < n2; l += 8 * numx) {
for (int k = 0; k < 8; k += 2) {
float2 curr = __half22float2(*((__half2*)(lvals + l + k)));
cuWelfordOnlineSum<float, simplified>(curr.x, mu, sigma2, count);
cuWelfordOnlineSum<float, simplified>(curr.y, mu, sigma2, count);
}
}
for (; l < n2; ++l) {
float curr = static_cast<float>(lvals[l]);
cuWelfordOnlineSum<float, simplified>(curr, mu, sigma2, count);
}
// intra-warp reductions
#pragma unroll
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
float muB = WARP_SHFL_DOWN(mu, stride);
float countB = WARP_SHFL_DOWN(count, stride);
float sigma2B = WARP_SHFL_DOWN(sigma2, stride);
cuChanOnlineSum<float, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
if (blockDim.y > 1) {
float* ubuf = (float*)buf;
float* ibuf = (float*)(ubuf + blockDim.y);
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
// upper half of warps write to shared
if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
const int wrt_y = threadIdx.y - offset;
ubuf[2 * wrt_y] = mu;
ubuf[2 * wrt_y + 1] = sigma2;
ibuf[wrt_y] = count;
}
__syncthreads();
// lower half merges
if (threadIdx.x == 0 && threadIdx.y < offset) {
float muB = ubuf[2 * threadIdx.y];
float sigma2B = ubuf[2 * threadIdx.y + 1];
float countB = ibuf[threadIdx.y];
cuChanOnlineSum<float, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
// threadIdx.x == 0 && threadIdx.y == 0 is the only thread that has the correct values
if (threadIdx.x == 0 && threadIdx.y == 0) {
ubuf[0] = mu;
ubuf[1] = sigma2;
}
__syncthreads();
mu = ubuf[0];
sigma2 = ubuf[1] / float(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
}
}
}
template <typename U>
__device__ U rsqrt(U v) {
return U(1) / sqrt(v);
}
template <>
__device__ float rsqrt(float v) {
return rsqrtf(v);
}
template <>
__device__ double rsqrt(double v) {
return rsqrt(v);
}
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
// template <typename T>
// struct SharedMemory
// {
// // Ensure that we won't compile any un-specialized types
// __device__ T *getPointer()
// {
// extern __device__ void error(void);
// error();
// return NULL;
// }
// };
// https://github.com/NVIDIA/apex/issues/246
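// Declared but intentionally left undefined: only the float and double
// specializations below can be instantiated, so unsupported element types
// fail at compile time.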
template <typename T>
struct SharedMemory;
template <>
struct SharedMemory<float> {
__device__ float* getPointer() {
extern __shared__ float s_float[];
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ double* getPointer() {
extern __shared__ double s_double[];
return s_double;
}
};
} // namespace
template <typename T, typename U, typename V, bool simplified>
__global__ void cuApplyLayerNorm(
V* __restrict__ output_vals,
U* __restrict__ mean,
U* __restrict__ inv_std_dev,
const T* __restrict__ vals,
const int n1,
const int n2,
const U epsilon,
const V* __restrict__ gamma,
const V* __restrict__ beta) {
// Assumptions:
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensors are contiguous
//
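  // Each block row (blockIdx.y) handles one or more rows of length n2: a
  // Welford pass computes the row's mean and variance, every thread in the
  // block then applies the (optionally simplified) affine normalization
  // element-wise, and thread (0, 0) writes out the saved statistics.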
for (int i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
SharedMemory<U> shared;
U* buf = shared.getPointer();
U mu, sigma2;
cuWelfordMuSigma2<T, U, simplified>(vals, n1, n2, i1, mu, sigma2, buf);
const T* lvals = vals + i1 * n2;
V* ovals = output_vals + i1 * n2;
U c_inv_std_dev = rsqrt(sigma2 + epsilon);
const int numx = blockDim.x * blockDim.y;
const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
for (int i = thrx; i < n2; i += numx) {
U curr = static_cast<U>(lvals[i]);
      V gamma_i = (gamma != nullptr) ? gamma[i] : (V)1;
      V beta_i = (beta != nullptr) ? beta[i] : (V)0;
if (simplified) {
ovals[i] = gamma_i * static_cast<V>(c_inv_std_dev * curr);
} else {
ovals[i] = gamma_i * static_cast<V>(c_inv_std_dev * (curr - mu)) + beta_i;
}
}
if (threadIdx.x == 0 && threadIdx.y == 0) {
if (mean != nullptr) mean[i1] = mu;
if (inv_std_dev != nullptr) inv_std_dev[i1] = c_inv_std_dev;
}
}
}
template <typename T, typename U, typename V, bool simplified>
void HostApplyLayerNorm(
const hipDeviceProp_t& prop,
hipStream_t stream,
V* output,
U* mean,
U* inv_std_dev,
const T* input,
int n1,
int n2,
double epsilon,
const V* gamma,
const V* beta) {
const int maxGridY = prop.maxGridSize[1];
const int warp_size = prop.warpSize;
ORT_ENFORCE(warp_size == GPU_WARP_SIZE_HOST);
dim3 threads(warp_size, 4, 1);
#ifdef __HIP_PLATFORM_HCC__
// Optimization for ROCm MI100
threads.y = 1;
#endif
const dim3 blocks(1, std::min<unsigned int>(n1, maxGridY), 1);
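  // Dynamic shared memory is only needed for the inter-warp reduction buffers;
  // with a single warp per block (threads.y == 1) the warp-shuffle path uses none.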
int nshared =
threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
hipLaunchKernelGGL(HIP_KERNEL_NAME(cuApplyLayerNorm<T, U, V, simplified>), blocks, threads, nshared, stream,
output,
mean,
inv_std_dev,
input,
n1, n2,
U(epsilon),
gamma, beta);
}
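// Illustrative usage (hypothetical variable names, not part of this file):
//   HostApplyLayerNorm<half, float, half, /*simplified*/ false>(
//       prop, stream, y, mean, inv_std_dev, x, batch_size, hidden_size,
//       1e-5, gamma, beta);
// normalizes each of the batch_size rows of length hidden_size and writes the
// per-row mean and inverse standard deviation when those pointers are non-null.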
#define LAYERNORM_LINEAR_IMPL(T, U, V, simplified) \
template void HostApplyLayerNorm<T, U, V, simplified>(const hipDeviceProp_t& prop, hipStream_t stream, V* output, \
U* mean, U* inv_std_dev, const T* input, int n1, int n2, \
double epsilon, const V* gamma, const V* beta);
LAYERNORM_LINEAR_IMPL(float, float, float, true)
LAYERNORM_LINEAR_IMPL(half, float, half, true)
LAYERNORM_LINEAR_IMPL(double, double, double, true)
LAYERNORM_LINEAR_IMPL(float, float, half, true)
LAYERNORM_LINEAR_IMPL(half, float, float, true)
LAYERNORM_LINEAR_IMPL(float, float, float, false)
LAYERNORM_LINEAR_IMPL(half, float, half, false)
LAYERNORM_LINEAR_IMPL(double, double, double, false)
LAYERNORM_LINEAR_IMPL(double, float, double, false)
LAYERNORM_LINEAR_IMPL(float, float, half, false)
LAYERNORM_LINEAR_IMPL(half, float, float, false)
LAYERNORM_LINEAR_IMPL(BFloat16, float, BFloat16, true)
LAYERNORM_LINEAR_IMPL(BFloat16, float, BFloat16, false)
} // namespace rocm
} // namespace onnxruntime
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename U, typename V, bool simplified>
void HostApplyLayerNorm(
const hipDeviceProp_t& prop,
hipStream_t stream,
V* output,
U* mean,
U* invvar,
const T* input,
int n1,
int n2,
double epsilon,
const V* gamma,
const V* beta);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "lrn.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_VERSIONED_TYPED(START_VER, END_VER, T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
LRN, \
kOnnxDomain, \
START_VER, \
END_VER, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
LRN<T>);
#define REGISTER_KERNEL_TYPED(VER, T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
LRN, \
kOnnxDomain, \
VER, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
LRN<T>);
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, float)
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, double)
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, MLFloat16)
REGISTER_KERNEL_TYPED(13, float)
REGISTER_KERNEL_TYPED(13, double)
REGISTER_KERNEL_TYPED(13, MLFloat16)
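// Validate the ONNX LRN attributes (size must be a positive odd window,
// alpha and beta must be positive, bias defaults to 1.0) and configure the
// MIOpen LRN descriptor once at kernel construction time.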
template <typename T>
LRN<T>::LRN(const OpKernelInfo& info) : RocmKernel(info) {
int64_t size;
ORT_ENFORCE(info.GetAttr<int64_t>("size", &size).IsOK());
ORT_ENFORCE(size > 0);
ORT_ENFORCE(size % 2 == 1);
float alpha;
float beta;
ORT_ENFORCE(info.GetAttr<float>("alpha", &alpha).IsOK());
ORT_ENFORCE(alpha > 0.0f);
ORT_ENFORCE(info.GetAttr<float>("beta", &beta).IsOK());
ORT_ENFORCE(beta > 0.0f);
float bias = info.GetAttrOrDefault<float>("bias", 1.0f);
ORT_ENFORCE(norm_desc_.Set(
gsl::narrow_cast<uint32_t>(size),
static_cast<double>(alpha),
static_cast<double>(beta),
static_cast<double>(bias))
.IsOK());
}
template <typename T>
Status LRN<T>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = context->Input<Tensor>(0);
auto rank = X->Shape().NumDimensions();
if (rank != 4 && rank != 5)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "miopen LRN only supports 4D or 5D input");
Tensor* Y = context->Output(0, X->Shape());
MiopenTensor x_tensor;
ORT_RETURN_IF_ERROR(x_tensor.Set(X->Shape().GetDims(), MiopenTensor::GetDataType<HipT>()));
const auto one = Consts<HipT>::One;
const auto zero = Consts<HipT>::Zero;
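  // MIOpen cross-channel LRN forward; the one/zero blend factors write the
  // result directly into Y without accumulating into existing output data.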
MIOPEN_RETURN_IF_ERROR(LRNCrossChannelForwardHelper(
MiopenHandle(),
norm_desc_,
miopenLRNCrossChannel,
&one,
x_tensor,
reinterpret_cast<const HipT*>(X->Data<T>()),
&zero,
x_tensor,
reinterpret_cast<HipT*>(Y->MutableData<T>())));
return Status::OK();
}
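// RAII wrapper around miopenLRNDescriptor_t: the descriptor is created lazily
// in Set() and released in the destructor.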
MiopenLRNDescriptor::MiopenLRNDescriptor() : desc_(nullptr) {
}
MiopenLRNDescriptor::~MiopenLRNDescriptor() {
if (desc_) {
miopenDestroyLRNDescriptor(desc_);
desc_ = nullptr;
}
}
Status MiopenLRNDescriptor::Set(uint32_t N, double alpha, double beta, double K) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreateLRNDescriptor(&desc_));
MIOPEN_RETURN_IF_ERROR(SetLRNDescriptorHelper(desc_, N, alpha, beta, K));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
class MiopenLRNDescriptor final {
public:
MiopenLRNDescriptor();
~MiopenLRNDescriptor();
Status Set(uint32_t N, double alpha, double beta, double K);
operator miopenLRNDescriptor_t() const { return desc_; }
private:
miopenLRNDescriptor_t desc_;
};
template <typename T>
class LRN : public RocmKernel {
public:
LRN(const OpKernelInfo& info);
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
private:
MiopenLRNDescriptor norm_desc_;
};
} // namespace rocm
} // namespace onnxruntime