Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/variadic_elementwise_ops_impl.h"
#include "core/providers/rocm/cu_inc/variadic_elementwise_impl.cuh"
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
#include "core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh"
#include "core/providers/rocm/math/variadic_elementwise_ops_tags.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename VariadicElementwiseOpTag>
struct VariadicElementwiseOpTraits;
#define DEFINE_TRAITS(VariadicElementwiseOpTag, ImplName) \
template <typename T> \
struct VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag> { \
using ScalarComputeFunctor = OP_##ImplName<T, T, T>; \
\
static void ComputeFn( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count) { \
Impl_##ImplName( \
stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
rhs_data, \
fdm_output_strides, \
fdm_H, \
fdm_C, \
output_data, \
count); \
} \
};
DEFINE_TRAITS(variadic_elementwise_ops::Sum, Add)
DEFINE_TRAITS(variadic_elementwise_ops::Min, Min)
DEFINE_TRAITS(variadic_elementwise_ops::Max, Max)
#undef DEFINE_TRAITS
template <typename T, typename VariadicElementwiseOpTag>
void Impl_General(
hipStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T* lhs_data,
const TArray<int64_t>* rhs_padded_strides,
const T* rhs_data,
const TArray<fast_divmod>* fdm_output_strides,
const fast_divmod& fdm_H,
const fast_divmod& fdm_C,
T* output_data,
size_t count) {
VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ComputeFn(
stream,
output_rank_or_simple_broadcast,
lhs_padded_strides,
lhs_data,
rhs_padded_strides,
rhs_data,
fdm_output_strides,
fdm_H,
fdm_C,
output_data,
count);
}
template <typename T, typename VariadicElementwiseOpTag>
void Impl_NoBroadcastInputBatch(
hipStream_t stream,
InputBatchArray<T> input_data_batch,
T* output_data,
size_t count) {
VariadicElementWiseNoBroadcastInputBatchImpl<
T, typename VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ScalarComputeFunctor,
k_max_input_batch_size>(
stream,
typename VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ScalarComputeFunctor{},
count,
input_data_batch,
output_data);
}
#define SPECIALIZE_IMPL(T, VariadicElementwiseOpTag) \
template void Impl_General<T, VariadicElementwiseOpTag>( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count); \
\
template void Impl_NoBroadcastInputBatch<T, VariadicElementwiseOpTag>( \
hipStream_t stream, \
InputBatchArray<T> input_data_batch, \
T * output_data, \
size_t count);
// the postfix means the types supported by the op:
// B: uint8_t
// W: uint16_t
// U: uint32_t
// Z: uint64_t
// C: int8_t
// S: int16_t
// I: int32_t
// L: int64_t
// H: float16
// F: float
// D: double
// O: bool
#define SPECIALIZE_IMPL_HFD(VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(half, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(float, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(double, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(BFloat16, VariadicElementwiseOpTag)
#define SPECIALIZE_IMPL_UZILHFD(VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(uint32_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(uint64_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(int32_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(int64_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL_HFD(VariadicElementwiseOpTag)
SPECIALIZE_IMPL_HFD(variadic_elementwise_ops::Sum)
SPECIALIZE_IMPL_UZILHFD(variadic_elementwise_ops::Min)
SPECIALIZE_IMPL_UZILHFD(variadic_elementwise_ops::Max)
#undef SPECIALIZE_IMPL_UZILHFD
#undef SPECIALIZE_IMPL_HFD
#undef SPECIALIZE_IMPL
} // namespace rocm
} // namespace onnxruntime
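The traits defined above route each variadic op tag to an existing binary element-wise implementation (Sum reuses Add; Min and Max reuse their binary counterparts). As a side note, here is a minimal standalone sketch of the same tag-dispatch traits pattern; the names are hypothetical and it has no ONNX Runtime dependencies:
#include <algorithm>
#include <iostream>
// Hypothetical tags mirroring variadic_elementwise_ops::Sum / ::Min.
struct SumTag {};
struct MinTag {};
// Primary template is declared but not defined, so only tagged specializations compile.
template <typename T, typename Tag>
struct OpTraits;
template <typename T>
struct OpTraits<T, SumTag> {
  static T Apply(T a, T b) { return a + b; }  // Sum falls back to binary addition
};
template <typename T>
struct OpTraits<T, MinTag> {
  static T Apply(T a, T b) { return std::min(a, b); }
};
int main() {
  std::cout << OpTraits<float, SumTag>::Apply(1.5f, 2.5f) << "\n";  // prints 4
  std::cout << OpTraits<float, MinTag>::Apply(1.5f, 2.5f) << "\n";  // prints 1.5
}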
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <cstdint>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename VariadicElementwiseOpTag>
void Impl_General(
hipStream_t stream,
int32_t output_rank_or_simple_broadcast,
const TArray<int64_t>* lhs_padded_strides,
const T* lhs_data,
const TArray<int64_t>* rhs_padded_strides,
const T* rhs_data,
const TArray<fast_divmod>* fdm_output_strides,
const fast_divmod& fdm_H,
const fast_divmod& fdm_C,
T* output_data,
size_t count);
constexpr int32_t k_max_input_batch_size = 8;
template <typename T>
using InputBatchArray = TArray<const T*, k_max_input_batch_size>;
template <typename T, typename VariadicElementwiseOpTag>
void Impl_NoBroadcastInputBatch(
hipStream_t stream,
InputBatchArray<T> input_data_batch,
T* output_data,
size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
namespace rocm {
namespace variadic_elementwise_ops {
struct Sum {};
struct Min {};
struct Max {};
} // namespace variadic_elementwise_ops
} // namespace rocm
} // namespace onnxruntime
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include <vector>
#include "core/common/common.h"
#include "core/common/gsl.h"
namespace onnxruntime {
namespace rocm {
// initial reference from:
// https://github.com/NVIDIA/apex/blob/5b71d3695bf39efcdcda9dff5be2f70314b8f091/csrc/multi_tensor_apply.cuh#L15
// Further experiments were run to get the numbers below. The larger the better, but if too large, they won't fit into the GPU stack.
constexpr int ACTUAL_TENSOR_GROUP_SIZE[8] = {1, 1, 2, 3, 4, 5, 6, 7};
constexpr int MAX_BLOCK_COUNTS[8] = {256, 320, 320, 320, 320, 288, 288, 256};
constexpr int MAX_TENSOR_GROUP_COUNTS[8] = {1, 96, 64, 32, 32, 32, 32, 32};
constexpr int MAX_BLOCK_THREAD_COUNTS[8] = {256, 512, 512, 512, 512, 512, 512, 512};
// TensorGroupSize is the number of parallel tensors. For element-wise
// operators such as Relu, it should be 1. For two-operand operators such as
// element-wise addition, it should be 2. The value 0 is reserved for implementing
// kernels to handle a single large tensor.
template <int TensorGroupSize>
struct ChunkGroup {
// Number of chunks in this ChunkGroup.
// It's the effective size of block_index_to_tensor_group_index and
// block_index_to_chunk_start_index.
// The i-th chunk starts at the block_index_to_chunk_start_index[i]-th
// element in the block_index_to_tensor_group_index[i]-th tensor.
int chunk_count = 0;
// Max number of elements in each chunk in this ChunkGroup.
// It's an upper bound because chunks located at the end of a tensor
// are not always full. For example, if we split a 7-element vector into
// two 4-element chunks, the second chunk contains only 3 actual values.
int chunk_size = 0;
// The blockIdx.x-th block processes chunks in the block_index_to_tensor_group_index[blockIdx.x]-th
// tensor group. Each chunk starts from the block_index_to_chunk_start_index[blockIdx.x]-th
// element and extends until the end of this chunk or the end of the whole tensor.
//
// Let i = block_index_to_tensor_group_index[blockIdx.x]
//     n = tensor_sizes[i]
//     b = block_index_to_chunk_start_index[blockIdx.x]
//     e = min(b + chunk_size, n)
// The valid index range for blockIdx.x is defined by the following inequality:
//   b <= valid index < e
int block_index_to_tensor_group_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
int block_index_to_chunk_start_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
int tensor_sizes[MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
// The addresses of tensors where the chunks are extracted from.
// 1. tensor_ptrs[0][i], ..., tensor_ptrs[TensorGroupSize-1][i] are
// the tensors' pointers in the i-th group.
// 2. All tensors in the i-th group have the same size, tensor_sizes[i].
void* tensor_ptrs[ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]][MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
// Max number of GPU blocks to process the chunks in this chunk group.
const static int max_block_count = MAX_BLOCK_COUNTS[TensorGroupSize];
// Max number of tensor groups in this chunk group.
const static int max_tensor_group_count = MAX_TENSOR_GROUP_COUNTS[TensorGroupSize];
// The suggested number of threads to launch per GPU block.
const static int thread_count_per_block = MAX_BLOCK_THREAD_COUNTS[TensorGroupSize];
};
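// Worked example (illustrative values, not from this commit): with TensorGroupSize == 1,
// two tensors of sizes {10, 6} and chunk_size == 4, launch_multi_tensor_functor below
// fills a ChunkGroup<1> as
//   chunk_count = 5, chunk_size = 4,
//   block_index_to_tensor_group_index = {0, 0, 0, 1, 1},
//   block_index_to_chunk_start_index  = {0, 4, 8, 0, 4},
//   tensor_sizes = {10, 6},
// so block 2 covers elements [8, 10) of tensor 0, i.e. a partially filled chunk.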
template <int TensorGroupSize>
int compute_max_tensor_size_per_launch(int element_count_per_thread) {
constexpr int block_count =
ChunkGroup<TensorGroupSize>::max_block_count;
constexpr int thread_count_per_block =
ChunkGroup<TensorGroupSize>::thread_count_per_block;
return block_count * thread_count_per_block * element_count_per_thread;
}
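// For example, using the table values above: TensorGroupSize == 2 and
// element_count_per_thread == 4 give 320 * 512 * 4 == 655360 elements per launch.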
template <int TensorGroupSize, typename TMultiTensorFunctor, typename... TFunctorParams>
void launch_multi_tensor_functor(
hipStream_t stream,
const int chunk_size,
gsl::span<int> tensor_sizes,
gsl::span<std::vector<void*>> grouped_tensor_pointers,
TMultiTensorFunctor multipleTensorKernel,
TFunctorParams&&... kernelParams) {
// Check if 32-bit integer is enough.
ORT_ENFORCE(tensor_sizes.size() > 0);
ORT_ENFORCE(tensor_sizes.size() < static_cast<size_t>(INT_MAX));
ORT_ENFORCE(grouped_tensor_pointers.size() > 0);
ORT_ENFORCE(grouped_tensor_pointers.size() < static_cast<size_t>(INT_MAX));
ORT_ENFORCE(chunk_size > 0);
// Number of groups, for example, the number of updated weight tensors in Lamb optimizer.
const int group_count = static_cast<int>(grouped_tensor_pointers.size());
// Tensor count per group.
const int group_size = static_cast<int>(grouped_tensor_pointers[0].size());
int tensor_group_index = 0;
int block_index = 0;
ORT_ENFORCE(grouped_tensor_pointers.size() == tensor_sizes.size());
ORT_ENFORCE(group_size == ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]);
for (int i = 0; i < group_count; ++i) {
ORT_ENFORCE(grouped_tensor_pointers[i].size() == static_cast<size_t>(group_size));
}
// Handle multiple tensors per ROCM kernel call.
ChunkGroup<TensorGroupSize> chunk_group;
for (int i = 0; i < group_count; ++i) {
// Add pointers to one group of tensors into chunk_group.
for (int j = 0; j < group_size; ++j) {
chunk_group.tensor_ptrs[j][tensor_group_index] = grouped_tensor_pointers[i][j];
}
// Assuming that all tensors in this group have the same shape, we record that common size once.
chunk_group.tensor_sizes[tensor_group_index] = tensor_sizes[i];
chunk_group.chunk_size = chunk_size;
const int chunk_count = (tensor_sizes[i] + chunk_size - 1) / chunk_size;
// Process all chunks in this tensor group.
for (int chunk_index = 0; chunk_index < chunk_count; ++chunk_index) {
chunk_group.block_index_to_tensor_group_index[block_index] = tensor_group_index;
chunk_group.block_index_to_chunk_start_index[block_index] = chunk_index * chunk_size;
// After ++block_index, block_index becomes the count of chunks in chunk_group.
++block_index;
chunk_group.chunk_count = block_index;
if (block_index == chunk_group.max_block_count) {
multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
block_index = 0;
}
}
// After ++tensor_group_index, tensor_group_index becomes the count of tensor groups in chunk_group.
++tensor_group_index;
if (tensor_group_index == chunk_group.max_tensor_group_count) {
multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
block_index = 0;
tensor_group_index = 0;
}
}
// This round of tensor group processing is finished.
// Any groups remaining in chunk_group must be processed now.
if (block_index != 0) {
multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
block_index = 0;
tensor_group_index = 0;
}
}
} // namespace rocm
} // namespace onnxruntime
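A minimal caller sketch for launch_multi_tensor_functor follows; it is not part of this commit, the functor and buffer names are hypothetical, and it assumes the declarations from the header above (TensorGroupSize == 2, so two pointers per tensor group):
// Hypothetical functor: receives the stream, a populated ChunkGroup<2>, and any
// extra arguments forwarded by launch_multi_tensor_functor.
struct ScaleAndCopyFunctor {
  void operator()(hipStream_t stream, onnxruntime::rocm::ChunkGroup<2> chunk_group, float alpha) {
    // A real implementation would launch a __global__ kernel with up to
    // chunk_group.max_block_count blocks of chunk_group.thread_count_per_block threads.
  }
};
void ExampleCall(hipStream_t stream, void* src0, void* dst0, void* src1, void* dst1) {
  std::vector<int> tensor_sizes = {1000000, 250000};
  std::vector<std::vector<void*>> grouped_ptrs = {{src0, dst0}, {src1, dst1}};
  onnxruntime::rocm::launch_multi_tensor_functor<2>(
      stream, /*chunk_size=*/2048, gsl::make_span(tensor_sizes),
      gsl::make_span(grouped_ptrs), ScaleAndCopyFunctor{}, /*alpha=*/0.5f);
}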
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "batch_norm.h"
#include "core/providers/common.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace std;
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
7, 8, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
9, 13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
14, 14, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("U", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
15, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>);
template <typename T>
Status BatchNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* B = p_op_kernel_context->Input<Tensor>(2);
const Tensor* mean = p_op_kernel_context->Input<Tensor>(3);
const Tensor* var = p_op_kernel_context->Input<Tensor>(4);
ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1));
const TensorShape& x_shape = X->Shape();
const TensorShape& channel_shape = mean->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
Tensor* running_mean = p_op_kernel_context->Output(1, channel_shape);
Tensor* running_var = p_op_kernel_context->Output(2, channel_shape);
Tensor* saved_mean = p_op_kernel_context->Output(3, channel_shape);
Tensor* saved_var = p_op_kernel_context->Output(4, channel_shape);
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto scale_data = reinterpret_cast<const HipT*>(scale->Data<T>());
auto b_data = reinterpret_cast<const HipT*>(B->Data<T>());
auto mean_data = reinterpret_cast<const HipT*>(mean->Data<T>());
auto var_data = reinterpret_cast<const HipT*>(var->Data<T>());
auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
const auto alpha = Consts<HipT>::One;
const auto beta = Consts<HipT>::Zero;
MiopenTensor data_desc;
vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));
// For half data type, the alpha, beta, scale, B, mean, var need to be float type
if (X->IsDataType<MLFloat16>()) {
MiopenTensor scale_desc;
ORT_RETURN_IF_ERROR(scale_desc.Set(new_dims, MiopenTensor::GetDataType<float>()));
MiopenTensor bn_tensor_desc;
ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, miopen_batch_norm_mode_));
// Convert the scale, B, mean, var to float
const int64_t C = x_shape.GetDims()[1];
auto f_scale = GetScratchBuffer<float>(C);
auto f_B = GetScratchBuffer<float>(C);
auto f_mean = GetScratchBuffer<float>(C);
auto f_var = GetScratchBuffer<float>(C);
Impl_Cast<HipT, float>(Stream(), scale_data, f_scale.get(), C);
Impl_Cast<HipT, float>(Stream(), b_data, f_B.get(), C);
Impl_Cast<HipT, float>(Stream(), mean_data, f_mean.get(), C);
Impl_Cast<HipT, float>(Stream(), var_data, f_var.get(), C);
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardInferenceHelper(
MiopenHandle(),
miopen_batch_norm_mode_,
&alpha,
&beta,
data_desc,
x_data,
data_desc,
y_data,
bn_tensor_desc,
f_scale.get(),
f_B.get(),
f_mean.get(),
f_var.get(),
epsilon_));
return Status::OK();
}
MiopenTensor bn_tensor_desc;
ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, miopen_batch_norm_mode_));
// in BatchNorm Forward Training mode if all 5 outputs present
if (running_mean && running_var && saved_mean && saved_var) {
auto running_mean_data = reinterpret_cast<HipT*>(running_mean->MutableData<T>());
auto running_var_data = reinterpret_cast<HipT*>(running_var->MutableData<T>());
auto saved_mean_data = reinterpret_cast<HipT*>(saved_mean->MutableData<T>());
auto saved_inv_var_data = reinterpret_cast<HipT*>(saved_var->MutableData<T>());
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopen_batch_norm_mode_,
&alpha,
&beta,
data_desc,
x_data,
data_desc,
y_data,
bn_tensor_desc,
scale_data,
b_data,
momentum_,
running_mean_data,
running_var_data,
epsilon_,
saved_mean_data,
saved_inv_var_data));
// in BatchNorm Forward Inference mode if only Y output present
} else {
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardInferenceHelper(
MiopenHandle(),
miopen_batch_norm_mode_,
&alpha,
&beta,
data_desc,
x_data,
data_desc,
y_data,
bn_tensor_desc,
scale_data,
b_data,
mean_data,
var_data,
epsilon_));
}
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status BatchNorm<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
SPECIALIZED_COMPUTE(MLFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class BatchNorm final : public RocmKernel {
public:
BatchNorm(const OpKernelInfo& op_kernel_info)
: RocmKernel{op_kernel_info},
miopen_batch_norm_mode_(miopenBNSpatial),
momentum_(0.9) {
float tmp_epsilon;
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
epsilon_ = ClampMiopenBatchNormEpsilon(static_cast<double>(tmp_epsilon));
// spatial or not
int64_t tmp_spatial;
if (op_kernel_info.GetAttr<int64_t>("spatial", &tmp_spatial).IsOK()) {
spatial_ = tmp_spatial;
}
if (spatial_ == 0) {
miopen_batch_norm_mode_ = miopenBNPerActivation;
}
float tmp_momentum;
if (op_kernel_info.GetAttr<float>("momentum", &tmp_momentum).IsOK()) {
momentum_ = static_cast<double>(tmp_momentum);
}
is_training_mode_ = (op_kernel_info.GetAttrOrDefault<int64_t>("training_mode", 0) == 1);
const auto& node = op_kernel_info.node();
auto opset = node.SinceVersion();
// batch norm opset 14 (or higher) is not implemented for training mode
ORT_ENFORCE(!(is_training_mode_ && opset >= 14), "Training mode does not support BN opset 14 (or higher) yet.");
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
double epsilon_;
int64_t spatial_ = 1; // default as per spec
miopenBatchNormMode_t miopen_batch_norm_mode_;
double momentum_;
bool is_training_mode_ = false;  // default as per spec
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/nn/dropout.h"
#include "core/providers/rocm/nn/dropout_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
namespace {
template <typename T>
struct GetRatioDataImpl {
void operator()(const Tensor* ratio, float& ratio_data) const {
ratio_data = static_cast<float>(*(ratio->Data<T>()));
ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f, "ratio_data is outside range [0, 1)");
}
};
template <typename T>
struct DropoutComputeImpl {
void operator()(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N, const int64_t mask_element_count,
const float ratio_data, PhiloxGenerator& generator, const Tensor& X, Tensor& Y, void* mask_data,
bool use_bitmask) const {
typedef typename ToHipType<T>::MappedType HipT;
const HipT* X_data = reinterpret_cast<const HipT*>(X.Data<T>());
HipT* Y_data = reinterpret_cast<HipT*>(Y.MutableData<T>());
DropoutKernelImpl<HipT>(prop, stream, N, mask_element_count, ratio_data, generator, X_data, Y_data, mask_data,
use_bitmask);
}
};
} // namespace
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Dropout, kOnnxDomain, 12, 12, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes())
.TypeConstraint("T1", DataTypeImpl::AllIEEEFloatTensorTypes())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.InputMemoryType(OrtMemTypeCPUInput, 2),
Dropout<false>);
ONNX_OPERATOR_KERNEL_EX(Dropout, kOnnxDomain, 13, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
.TypeConstraint("T1", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.InputMemoryType(OrtMemTypeCPUInput, 2),
Dropout<false>);
template <bool UseBitmask>
Status Dropout<UseBitmask>::ComputeInternal(OpKernelContext* context) const {
// Get X_data
const Tensor* X = context->Input<Tensor>(0);
if (!X) return Status(common::ONNXRUNTIME, common::FAIL, "X Input is not available.");
const TensorShape& shape = X->Shape();
const int64_t N = shape.Size();
// Get Y_data
auto Y = context->Output(0, shape);
// Get mask_data
Tensor* mask = nullptr;
int64_t mask_element_count = N;
if (UseBitmask) {
mask_element_count = (N + kNumBitsPerBitmaskElement - 1) / kNumBitsPerBitmaskElement;
mask = context->Output(1, {mask_element_count});
} else {
mask = context->Output(1, shape);
}
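// For example, assuming kNumBitsPerBitmaskElement == 32: an input with N == 1000 elements
// needs mask_element_count == (1000 + 31) / 32 == 32 bitmask words in the bitmask path,
// versus 1000 bool elements otherwise.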
ORT_ENFORCE(!mask || mask->Shape().Size() == mask_element_count);
// Get the ratio_data
float ratio_data = default_ratio_;
auto ratio = context->Input<Tensor>(1);
if (ratio) {
utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(ratio->GetElementType());
t_disp.Invoke<GetRatioDataImpl>(ratio, ratio_data);
}
const Tensor* training_mode = context->Input<Tensor>(2);
// Check for inference mode.
if (ratio_data == 0.f || !training_mode || !(*(training_mode->Data<bool>()))) {
const void* X_data = X->DataRaw();
void* Y_data = Y->MutableDataRaw();
if (Y_data != X_data) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y_data, X_data, X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
}
// If mask is requested, return all 1s.
if (mask) {
if (UseBitmask) {
HIP_RETURN_IF_ERROR(
hipMemsetAsync(mask->MutableDataRaw(), -1, mask_element_count * sizeof(BitmaskElementType), Stream()));
} else {
HIP_RETURN_IF_ERROR(
hipMemsetAsync(mask->MutableData<bool>(), true, mask_element_count * sizeof(bool), Stream()));
}
}
return Status::OK();
}
IAllocatorUniquePtr<void> temp_mask_buffer{}; // buffer to use if mask is not provided
void* const mask_data = [this, mask_element_count, mask, &temp_mask_buffer]() {
if (mask) return mask->MutableDataRaw();
temp_mask_buffer =
GetScratchBuffer<void>(mask_element_count * (UseBitmask ? sizeof(BitmaskElementType) : sizeof(bool)));
return temp_mask_buffer.get();
}();
PhiloxGenerator& generator = generator_ ? *generator_ : PhiloxGenerator::Default();
utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(X->GetElementType());
t_disp.Invoke<DropoutComputeImpl>(GetDeviceProp(), Stream(), N, mask_element_count, ratio_data, generator, *X, *Y,
mask_data, UseBitmask);
return Status::OK();
}
// Instantiation for Dropout.
template class Dropout<false>;
template class Dropout<true>;
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/common.h"
#include "core/framework/random_generator.h"
namespace onnxruntime {
namespace rocm {
template <bool UseBitmask>
class Dropout final : public RocmKernel {
public:
Dropout(const OpKernelInfo& info) : RocmKernel(info) {
int64_t seed = 0;
if (info.GetAttr<int64_t>("seed", &seed).IsOK()) {
generator_ = std::make_unique<PhiloxGenerator>(static_cast<uint64_t>(seed));
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
mutable std::unique_ptr<PhiloxGenerator> generator_;
static constexpr float default_ratio_ = 0.5f;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
#include "core/providers/rocm/nn/dropout_impl.h"
#include <hiprand_kernel.h>
#include <algorithm>
#include "core/providers/rocm/cu_inc/bitmask.cuh"
namespace onnxruntime {
namespace rocm {
constexpr int kBlockSize = 256;
constexpr int kNumUnroll = 4;
template <typename T, bool UseBitmask>
__global__ void DropoutKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
const int steps_per_thread, const fast_divmod fdm_bits_per_element, const float ratio,
const std::pair<uint64_t, uint64_t> seeds, const T* X_data, T* Y_data, void* mask_data) {
HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
const float p = 1.0f - ratio;
const float scale = 1.0f / p;
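// For example, ratio == 0.25 gives p == 0.75: about 75% of the elements are kept and
// each survivor is scaled by 1 / 0.75, so the expected sum of the output matches the input.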
hiprandStatePhilox4_32_10_t state;
hiprand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// We ensure every thread generates the same number of random numbers (by rounding
// up the size) and at the same timestep (by syncing threads).
// From the ROCm hiprand documentation:
// The Philox_4x32_10 algorithm is closely tied to the thread and block count.
// Each thread computes 4 random numbers at the same time, thus the most efficient
// use of Philox_4x32_10 is to generate a multiple of 4 times the number of threads.
for (int i = 0; i < steps_per_thread; ++i) {
HIP_LONG id = idx * kNumUnroll + i * step_size;
rand = hiprand_uniform4(&state);
BitmaskElementType thread_bitmask = 0;
// actual computation
#pragma unroll
for (int ii = 0; ii < kNumUnroll; ++ii) {
HIP_LONG li = id + ii;
if (li < N) {
bool mask = (&rand.x)[ii] < p;
Y_data[li] = static_cast<T>(static_cast<float>(X_data[li]) * mask * scale);
if (UseBitmask) {
thread_bitmask |= (mask << ii);
} else {
reinterpret_cast<bool*>(mask_data)[li] = mask;
}
}
}
if (UseBitmask) {
SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
reinterpret_cast<BitmaskElementType*>(mask_data));
}
__syncthreads();
}
}
template <typename T, bool UseBitmask>
__global__ void DropoutVectorizedKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
const int steps_per_thread, const fast_divmod fdm_bits_per_element,
const float ratio, const std::pair<uint64_t, uint64_t> seeds, const T* X_data,
T* Y_data, void* mask_data) {
HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
const float p = 1.0f - ratio;
const float scale = 1.0f / p;
hiprandStatePhilox4_32_10_t state;
hiprand_init(seeds.first, idx, seeds.second, &state);
float4 rand;
// Use the vectorized data load/store approach when N % 4 == 0, since this is the
// typical case for input shape sizes.
using LoadT = aligned_vector<T, kNumUnroll>;
using MaskLoadT = aligned_vector<bool, kNumUnroll>;
for (int i = 0; i < steps_per_thread; ++i) {
HIP_LONG id = idx * kNumUnroll + i * step_size;
rand = hiprand_uniform4(&state);
BitmaskElementType thread_bitmask = 0;
if (id < N) {
// vectorized load into storage
T src[kNumUnroll];
LoadT* value = reinterpret_cast<LoadT*>(&src);
*value = *reinterpret_cast<const LoadT*>(&X_data[id]);
T r[kNumUnroll];
bool masks[kNumUnroll];
// actual computation
#pragma unroll
for (int ii = 0; ii < kNumUnroll; ++ii) {
bool mask = (&rand.x)[ii] < p;
r[ii] = static_cast<T>(static_cast<float>(src[ii]) * mask * scale);
if (UseBitmask) {
thread_bitmask |= (mask << ii);
} else {
masks[ii] = mask;
}
}
// Vectorized writes for mask_data & Y_data
*(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
if (!UseBitmask) {
*(reinterpret_cast<MaskLoadT*>(&reinterpret_cast<bool*>(mask_data)[id])) =
*reinterpret_cast<MaskLoadT*>(&masks[0]);
}
}
if (UseBitmask) {
SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
reinterpret_cast<BitmaskElementType*>(mask_data));
}
__syncthreads();
}
}
#define LAUNCH_DROPOUT_KERNEL(FuncName, UseBitmask) \
hipLaunchKernelGGL(HIP_KERNEL_NAME(FuncName<T, UseBitmask>), grid_size, kBlockSize, 0, stream, \
static_cast<HIP_LONG>(N), static_cast<HIP_LONG>(mask_element_count), step_size, steps_per_thread, \
fdm_bits_per_element, ratio, seeds, X_data, Y_data, mask_data)
#define HANDLE_DROPOUT_USE_BITMASK(FuncName) \
if (use_bitmask) { \
LAUNCH_DROPOUT_KERNEL(FuncName, true); \
} else { \
LAUNCH_DROPOUT_KERNEL(FuncName, false); \
}
template <typename T>
void DropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, const T* X_data,
T* Y_data, void* mask_data, bool use_bitmask) {
const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / kBlockSize;
const int grid_size =
std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, kBlockSize * kNumUnroll)));
// Compute the number of random numbers generated by each thread, and increment philox generator offset by that
// amount.
const int step_size = kBlockSize * grid_size * kNumUnroll;
const int steps_per_thread = static_cast<int>(CeilDiv(N, step_size));
auto seeds = generator.NextPhiloxSeeds(static_cast<uint64_t>(steps_per_thread * kNumUnroll));
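// Worked example (illustrative numbers): for N == 100000 with kBlockSize == 256 and
// kNumUnroll == 4, CeilDiv(N, 1024) == 98 blocks are requested (capped by the occupancy
// limit above), step_size == 256 * grid_size * 4, and each thread consumes
// steps_per_thread * 4 Philox outputs, which is the offset increment passed to NextPhiloxSeeds.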
fast_divmod fdm_bits_per_element(kNumBitsPerBitmaskElement);
if (N % kNumUnroll != 0) {
HANDLE_DROPOUT_USE_BITMASK(DropoutKernel);
} else {
HANDLE_DROPOUT_USE_BITMASK(DropoutVectorizedKernel);
}
}
#undef HANDLE_DROPOUT_USE_BITMASK
#undef LAUNCH_DROPOUT_KERNEL
#define SPECIALIZED_DROPOUT_IMPL(T) \
template void DropoutKernelImpl<T>(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N, \
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, \
const T* X_data, T* Y_data, void* mask_data, bool use_bitmask);
SPECIALIZED_DROPOUT_IMPL(float)
SPECIALIZED_DROPOUT_IMPL(double)
SPECIALIZED_DROPOUT_IMPL(half)
SPECIALIZED_DROPOUT_IMPL(BFloat16)
#undef SPECIALIZED_DROPOUT_IMPL
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/framework/random_generator.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void DropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, const T* X_data,
T* Y_data, void* mask_data, bool use_bitmask);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "instance_norm.h"
#include "instance_norm_impl.h"
#include "core/providers/cpu/nn/instance_norm_helper.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
InstanceNormalization, \
kOnnxDomain, \
6, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
InstanceNorm<T>);
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
template <typename T>
InstanceNorm<T>::InstanceNorm(const OpKernelInfo& op_kernel_info)
: RocmKernel(op_kernel_info) {
float tmp_epsilon;
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
epsilon_ = ClampMiopenBatchNormEpsilon(tmp_epsilon);
}
template <typename T>
Status InstanceNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));
const TensorShape& x_shape = X->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
const auto* x_data = reinterpret_cast<const HipT*>(X->Data<T>());
const auto* scale_data = reinterpret_cast<const HipT*>(scale->Data<T>());
const auto* bias_data = reinterpret_cast<const HipT*>(bias->Data<T>());
const auto& x_dims = x_shape.GetDims();
const int64_t N = x_dims[0];
const int64_t C = x_dims[1];
const auto one = Consts<HipT>::One;
const auto zero = Consts<HipT>::Zero;
if (N == 1) {
// when N == 1, we can treat it as spatial batch normalization in training
// as the mean/variance would be computed from input
MiopenTensor data_desc;
std::vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, miopenBNSpatial));
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data,
stats_desc,
scale_data,
bias_data,
1.0f,
nullptr,
nullptr,
epsilon_,
nullptr,
nullptr));
} else {
// We use miopenBatchNormalizationForwardTraining to compute the mean/variance,
// so we collapse N and C into the channel dimension.
auto input_count = x_shape.Size(); // N * C * H * W
auto stats_count = x_shape.SizeToDimension(2); // N * C
auto image_size = input_count / stats_count;
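// For example, for an input of shape (2, 3, 4, 5): input_count == 120,
// stats_count == 2 * 3 == 6 and image_size == 20, so below the data is described
// as a (1, 6, 20, 1) tensor and the per-instance, per-channel statistics as (1, 6, 1, 1).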
MiopenTensor data_desc;
ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1}, MiopenTensor::GetDataType<HipT>()));
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1}, MiopenTensor::GetDataType<HipT>()));
const size_t stats_byte_count = stats_count * sizeof(HipT);
// Mean & Variance are inputs & outputs and must be initialized to zero to work properly
auto mean = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
auto variance = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));
// We must set the scale & bias inputs to zero as they are inputs to the calculation
auto unused_scale = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
auto unused_bias = GetScratchBuffer<HipT>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));
// first, compute mean and variance per-instance per-channel using miopenBatchNorm training
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data, // use y temporarily, would be rewritten later
stats_desc,
unused_scale.get(),
unused_bias.get(),
1.0f,
mean.get(),
variance.get(),
MIOPEN_BN_MIN_EPSILON,
nullptr,
nullptr));
// Y = scale * (x - mean) / sqrt (variance + epsilon) + B
// X/Y is (N,C,H,W)
// scale/bias is (1,C,1,1)
// mean/stddev is (N,C,1,1)
// NOTE miopenBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
// and it needs to be corrected with (count - 1) / count
fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
fast_divmod fdm_C(gsl::narrow_cast<int>(C));
InstanceNormImpl<HipT>(
Stream(),
x_data,
scale_data,
bias_data,
mean.get(),
variance.get(),
(image_size - 1.0) / image_size,
static_cast<double>(epsilon_),
fdm_HW,
fdm_C,
y_data,
input_count);
}
return Status::OK();
}
template <>
Status InstanceNorm<MLFloat16>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<MLFloat16>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));
const TensorShape& x_shape = X->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<MLFloat16>());
const auto* x_data = reinterpret_cast<const HipT*>(X->Data<MLFloat16>());
const auto* scale_data = reinterpret_cast<const HipT*>(scale->Data<MLFloat16>());
const auto* bias_data = reinterpret_cast<const HipT*>(bias->Data<MLFloat16>());
const auto& x_dims = x_shape.GetDims();
const int64_t N = x_dims[0];
const int64_t C = x_dims[1];
const auto one = Consts<HipT>::One;
const auto zero = Consts<HipT>::Zero;
if (N == 1) {
// when N == 1, we can treat it as spatial batch normalization in training
// as the mean/variance would be computed from input
MiopenTensor data_desc;
std::vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, miopenBNSpatial));
// For half input data type, alpha, beta, scale, and bias need to be of float type.
// alpha and beta are already float because the Consts struct specialization
// for MLFloat16 takes care of that, so only the scale and bias are converted to float.
auto scale_data_fp32 = GetScratchBuffer<float>(C);
Impl_Cast<HipT, float>(Stream(), scale_data, scale_data_fp32.get(), C);
auto bias_data_fp32 = GetScratchBuffer<float>(C);
Impl_Cast<HipT, float>(Stream(), bias_data, bias_data_fp32.get(), C);
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data,
stats_desc,
scale_data_fp32.get(),
bias_data_fp32.get(),
1.0f,
nullptr,
nullptr,
epsilon_,
nullptr,
nullptr));
} else {
// We use miopenBatchNormalizationForwardTraining to compute the mean/variance,
// so we collapse N and C into the channel dimension.
auto input_count = x_shape.Size(); // N * C * H * W
auto stats_count = x_shape.SizeToDimension(2); // N * C
auto image_size = input_count / stats_count;
MiopenTensor data_desc;
ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1},
MiopenTensor::GetDataType<HipT>()));
// stats_desc needs to be of 'float' type even for float16 input as the "stats" are of float type
MiopenTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1},
MiopenTensor::GetDataType<float>()));
// For half input data type, we need to allocate some "intermediate"
// float buffers for MIOpen to use.
const size_t stats_byte_count = stats_count * sizeof(float);
// Mean & Variance are inputs & outputs and must be initialized to zero to work properly
auto mean = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
auto variance = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));
// We must set the scale & bias inputs to zero as they are inputs to the calculation
auto unused_scale = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
auto unused_bias = GetScratchBuffer<float>(stats_count);
HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));
// first, compute mean and variance per-instance per-channel using miopenBatchNorm training
MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
MiopenHandle(),
miopenBNSpatial,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data, // use y temporarily, would be rewritten later
stats_desc,
unused_scale.get(),
unused_bias.get(),
1.0f,
mean.get(),
variance.get(),
MIOPEN_BN_MIN_EPSILON,
nullptr,
nullptr));
// Y = scale * (x - mean) / sqrt (variance + epsilon) + B
// X/Y is (N,C,H,W)
// scale/bias is (1,C,1,1)
// mean/stddev is (N,C,1,1)
// NOTE miopenBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
// and it needs to be corrected with (count - 1) / count
fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
fast_divmod fdm_C(gsl::narrow_cast<int>(C));
// The InstanceNormImpl kernel handles the mean/variance in float32, so no casting required here
InstanceNormImpl<HipT, float>(
Stream(),
x_data,
scale_data,
bias_data,
mean.get(),
variance.get(),
(image_size - 1.0) / image_size,
static_cast<double>(epsilon_),
fdm_HW,
fdm_C,
y_data,
input_count);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class InstanceNorm final : public RocmKernel {
public:
InstanceNorm(const OpKernelInfo& op_kernel_info);
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
private:
double epsilon_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "instance_norm_impl.h"
namespace onnxruntime {
namespace rocm {
template <typename T1, typename T2>
__global__ void _InstanceNormKernel(
const T1* __restrict__ input_data,
const T1* __restrict__ scale,
const T1* __restrict__ bias,
const T2* __restrict__ mean,
const T2* __restrict__ variance,
const double variance_correction,
const double epsilon,
const fast_divmod fdm_HW,
const fast_divmod fdm_C,
T1* __restrict__ output_data,
const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int nc = fdm_HW.div(id);
int n, c;
fdm_C.divmod(nc, n, c);
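// For example, for an NCHW input of shape (2, 3, 4, 5): HW == 20 and C == 3, so
// id == 67 gives nc == 67 / 20 == 3 and then n == 1, c == 0.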
// Y = scale * (x - mean) / sqrt (std * std + epsilon) + B
output_data[id] = scale[c] * (input_data[id] - (T1)mean[nc]) / _Sqrt((T1)variance[nc] * (T1)variance_correction + (T1)epsilon) + bias[c];
}
template <typename T1, typename T2>
void InstanceNormImpl(
hipStream_t stream,
const T1* input_data,
const T1* scale,
const T1* bias,
const T2* mean,
const T2* variance,
const double variance_correction,
const double epsilon,
const fast_divmod& fdm_HW,
const fast_divmod& fdm_C,
T1* output_data,
size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_InstanceNormKernel<T1, T2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C, output_data, (HIP_LONG)N);
}
#define SPECIALIZED_IMPL(T1, T2) \
template void InstanceNormImpl<T1, T2>(hipStream_t stream, const T1* input_data, const T1* scale, const T1* bias, const T2* mean, const T2* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T1* output_data, size_t count);
SPECIALIZED_IMPL(float, float)
SPECIALIZED_IMPL(double, double)
// When the input data type is float16, the means and variances will flow in as float32 (special case)
SPECIALIZED_IMPL(half, float)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
template <typename T1, typename T2>
void InstanceNormImpl(
hipStream_t stream,
const T1* input_data,
const T1* scale,
const T1* bias,
const T2* mean,
const T2* variance,
const double variance_correction,
const double epsilon,
const fast_divmod& fdm_HW,
const fast_divmod& fdm_C,
T1* output_data,
size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/layer_norm.h"
#include "core/providers/rocm/nn/layer_norm_impl.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T, U) \
ONNX_OPERATOR_TYPED_KERNEL_EX(LayerNormalization, kOnnxDomain, 17, T, kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("U", DataTypeImpl::GetTensorType<U>()), \
LayerNorm<T, U, T, false>);
REGISTER_KERNEL_TYPED(float, float)
REGISTER_KERNEL_TYPED(double, float)
REGISTER_KERNEL_TYPED(MLFloat16, float)
REGISTER_KERNEL_TYPED(BFloat16, float)
template <typename T, typename U, typename V, bool simplified>
LayerNorm<T, U, V, simplified>::LayerNorm(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {
ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK());
float tmp_epsilon;
ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
epsilon_ = tmp_epsilon;
}
template <typename T, typename U, typename V, bool simplified>
Status LayerNorm<T, U, V, simplified>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<T>::MappedType HipT;
typedef typename ToHipType<U>::MappedType CudaU;
typedef typename ToHipType<V>::MappedType CudaV;
// Inputs
const Tensor* X = ctx->Input<Tensor>(0);
const Tensor* scale = ctx->Input<Tensor>(1);
const Tensor* bias = ctx->Input<Tensor>(2);
auto X_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto scale_data = reinterpret_cast<const CudaV*>(scale->Data<V>());
auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaV*>(bias->Data<V>());
const TensorShape& x_shape = X->Shape();
const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
int n2 = gsl::narrow<int>(x_shape.SizeFromDimension(axis));
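// For example, for X of shape (2, 128, 768) with axis == -1 (normalized to 2),
// n1 == 2 * 128 == 256 rows are each normalized over n2 == 768 elements.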
const auto scale_size = scale->Shape().Size();
const auto bias_size = (bias_data) ? bias->Shape().Size() : 0;
if (n2 == 1 || scale_size != n2 || (bias_data && bias_size != n2)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Size of X.shape()[axis:] == ", n2,
". Size of scale and bias (if provided) must match this "
"and the size must not be 1. Got scale size of ",
scale_size, " and bias size of ", bias_size);
}
// Outputs
Tensor* Y = ctx->Output(0, x_shape);
auto Y_data = reinterpret_cast<CudaV*>(Y->MutableData<V>());
// Mean and variance
std::vector<int64_t> mean_inv_std_var_dim;
for (int i = 0; i < static_cast<int>(x_shape.NumDimensions()); ++i) {
if (i < axis) {
mean_inv_std_var_dim.emplace_back(x_shape.GetDims()[i]);
} else {
mean_inv_std_var_dim.emplace_back(1);
}
}
int output_index = 1;
CudaU* mean_data = nullptr;
if (!simplified) {
Tensor* mean = ctx->Output(output_index++, TensorShape(mean_inv_std_var_dim));
if (mean != nullptr) {
mean_data = reinterpret_cast<CudaU*>(mean->MutableData<U>());
}
}
CudaU* inv_var_data = nullptr;
Tensor* var = ctx->Output(output_index, TensorShape(mean_inv_std_var_dim));
if (var != nullptr) {
inv_var_data = reinterpret_cast<CudaU*>(var->MutableData<U>());
}
if (x_shape.Size() == 0) {
return Status::OK();
}
HostApplyLayerNorm<HipT, CudaU, CudaV, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data,
X_data, n1, n2, epsilon_, scale_data, bias_data);
return Status::OK();
}
#if !defined(DISABLE_CONTRIB_OPS)
#define LAYERNORM_IMPL(T, U, V, simplified) \
template class LayerNorm<T, U, V, simplified>;
// contrib op usage
LAYERNORM_IMPL(float, float, float, false)
LAYERNORM_IMPL(double, double, double, false)
LAYERNORM_IMPL(MLFloat16, float, MLFloat16, false)
LAYERNORM_IMPL(float, float, MLFloat16, false)
LAYERNORM_IMPL(MLFloat16, float, float, false)
LAYERNORM_IMPL(BFloat16, float, BFloat16, false)
LAYERNORM_IMPL(float, float, float, true)
LAYERNORM_IMPL(double, double, double, true)
LAYERNORM_IMPL(MLFloat16, float, MLFloat16, true)
LAYERNORM_IMPL(float, float, MLFloat16, true)
LAYERNORM_IMPL(MLFloat16, float, float, true)
LAYERNORM_IMPL(BFloat16, float, BFloat16, true)
#endif
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
using namespace onnxruntime::rocm;
// NOTE: This was originally a contrib op with 3 type constraints. The ONNX spec merges 'T' and 'V'.
// The kernel is templatized on all three for backwards compatibility, but in ONNX usage T == V.
template <typename T, typename U, typename V, bool simplified>
class LayerNorm final : public RocmKernel {
public:
LayerNorm(const OpKernelInfo& op_kernel_info);
Status ComputeInternal(OpKernelContext* ctx) const override;
private:
int64_t axis_;
double epsilon_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#include "core/providers/rocm/cu_inc/common.cuh"
#include "layer_norm_impl.h"
namespace onnxruntime {
namespace rocm {
using namespace onnxruntime::rocm;
template <typename U, bool simplified>
__device__ void cuWelfordOnlineSum(
const U curr,
U& mu,
U& sigma2,
U& count) {
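// Welford's online update: with the new count k, mu_k = mu_{k-1} + (x_k - mu_{k-1}) / k, and in the
// non-simplified case sigma2 accumulates M2_k = M2_{k-1} + (x_k - mu_{k-1}) * (x_k - mu_k);
// dividing by n2 at the end of cuWelfordMuSigma2 yields the (biased) variance.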
count = count + U(1);
U delta = curr - mu;
U lmean = mu + delta / count;
mu = lmean;
if (simplified) {
sigma2 = sigma2 + curr * curr;
} else {
U delta2 = curr - lmean;
sigma2 = sigma2 + delta * delta2;
}
}
template <typename U, bool simplified>
__device__ void cuChanOnlineSum(
const U muB,
const U sigma2B,
const U countB,
U& mu,
U& sigma2,
U& count) {
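// Chan et al.'s parallel merge of two partial (mean, M2, count) results: the merged mean is the
// count-weighted average, and in the non-simplified case the merged M2 adds
// delta^2 * nA * nB / (nA + nB), which the nA / nX and nB / nX scaling below computes.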
U delta = muB - mu;
U nA = count;
U nB = countB;
count = count + countB;
U nX = count;
if (nX > U(0)) {
nA = nA / nX;
nB = nB / nX;
mu = nA * mu + nB * muB;
if (simplified) {
sigma2 = sigma2 + sigma2B;
} else {
sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
}
} else {
mu = U(0);
sigma2 = U(0);
}
}
template <typename T, typename U, bool simplified>
__device__ void cuWelfordMuSigma2(
const T* __restrict__ vals,
const int n1,
const int n2,
const int i1,
U& mu,
U& sigma2,
U* buf) {
// Assumptions:
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
// compute variance and mean over n2
U count = U(0);
mu = U(0);
sigma2 = U(0);
if (i1 < n1) {
// one warp normalizes one n1 index,
// synchronization is implicit
// initialize with standard Welford algorithm
const int numx = blockDim.x * blockDim.y;
const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
const T* lvals = vals + i1 * n2;
int l = 4 * thrx;
for (; l + 3 < n2; l += 4 * numx) {
for (int k = 0; k < 4; ++k) {
U curr = static_cast<U>(lvals[l + k]);
cuWelfordOnlineSum<U, simplified>(curr, mu, sigma2, count);
}
}
for (; l < n2; ++l) {
U curr = static_cast<U>(lvals[l]);
cuWelfordOnlineSum<U, simplified>(curr, mu, sigma2, count);
}
// intra-warp reductions
#pragma unroll
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
U muB = WARP_SHFL_DOWN(mu, stride);
U countB = WARP_SHFL_DOWN(count, stride);
U sigma2B = WARP_SHFL_DOWN(sigma2, stride);
cuChanOnlineSum<U, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
if (blockDim.y > 1) {
U* ubuf = (U*)buf;
U* ibuf = (U*)(ubuf + blockDim.y);
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
// upper half of warps write to shared
if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
const int wrt_y = threadIdx.y - offset;
ubuf[2 * wrt_y] = mu;
ubuf[2 * wrt_y + 1] = sigma2;
ibuf[wrt_y] = count;
}
__syncthreads();
// lower half merges
if (threadIdx.x == 0 && threadIdx.y < offset) {
U muB = ubuf[2 * threadIdx.y];
U sigma2B = ubuf[2 * threadIdx.y + 1];
U countB = ibuf[threadIdx.y];
cuChanOnlineSum<U, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
// threadIdx.x == 0 && threadIdx.y == 0 is the only thread that has the correct values
if (threadIdx.x == 0 && threadIdx.y == 0) {
ubuf[0] = mu;
ubuf[1] = sigma2;
}
__syncthreads();
mu = ubuf[0];
sigma2 = ubuf[1] / U(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
}
}
}
template <bool simplified>
__device__ void cuWelfordMuSigma2(
const half* __restrict__ vals,
const int n1,
const int n2,
const int i1,
float& mu,
float& sigma2,
float* buf) {
// Assumptions:
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
// compute variance and mean over n2
float count = 0.0f;
mu = float(0);
sigma2 = float(0);
if (i1 < n1) {
// one warp normalizes one n1 index,
// synchronization is implicit
// initialize with standard Welford algorithm
const int numx = blockDim.x * blockDim.y;
const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
const half* lvals = vals + i1 * n2;
int l = 8 * thrx;
if ((((size_t)lvals) & 3) != 0) {
// 16 bit alignment
// first thread consumes first point
if (thrx == 0) {
float curr = static_cast<float>(lvals[0]);
cuWelfordOnlineSum<float, simplified>(curr, mu, sigma2, count);
}
++l;
}
// at this point, lvals[l] are 32 bit aligned for all threads.
for (; l + 7 < n2; l += 8 * numx) {
for (int k = 0; k < 8; k += 2) {
float2 curr = __half22float2(*((__half2*)(lvals + l + k)));
cuWelfordOnlineSum<float, simplified>(curr.x, mu, sigma2, count);
cuWelfordOnlineSum<float, simplified>(curr.y, mu, sigma2, count);
}
}
for (; l < n2; ++l) {
float curr = static_cast<float>(lvals[l]);
cuWelfordOnlineSum<float, simplified>(curr, mu, sigma2, count);
}
// intra-warp reductions
#pragma unroll
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
float muB = WARP_SHFL_DOWN(mu, stride);
float countB = WARP_SHFL_DOWN(count, stride);
float sigma2B = WARP_SHFL_DOWN(sigma2, stride);
cuChanOnlineSum<float, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
if (blockDim.y > 1) {
float* ubuf = (float*)buf;
float* ibuf = (float*)(ubuf + blockDim.y);
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
// upper half of warps write to shared
if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
const int wrt_y = threadIdx.y - offset;
ubuf[2 * wrt_y] = mu;
ubuf[2 * wrt_y + 1] = sigma2;
ibuf[wrt_y] = count;
}
__syncthreads();
// lower half merges
if (threadIdx.x == 0 && threadIdx.y < offset) {
float muB = ubuf[2 * threadIdx.y];
float sigma2B = ubuf[2 * threadIdx.y + 1];
float countB = ibuf[threadIdx.y];
cuChanOnlineSum<float, simplified>(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
// threadIdx.x == 0 && threadIdx.y == 0 is the only thread that has the correct values
if (threadIdx.x == 0 && threadIdx.y == 0) {
ubuf[0] = mu;
ubuf[1] = sigma2;
}
__syncthreads();
mu = ubuf[0];
sigma2 = ubuf[1] / float(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
}
}
}
template <typename U>
__device__ U rsqrt(U v) {
return U(1) / sqrt(v);
}
template <>
__device__ float rsqrt(float v) {
return rsqrtf(v);
}
template <>
__device__ double rsqrt(double v) {
return rsqrt(v);
}
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
// template <typename T>
// struct SharedMemory
// {
// // Ensure that we won't compile any un-specialized types
// __device__ T *getPointer()
// {
// extern __device__ void error(void);
// error();
// return NULL;
// }
// };
// https://github.com/NVIDIA/apex/issues/246
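// Declared but intentionally left undefined: only the float and double
// specializations below can be instantiated, so unsupported element types
// fail at compile time.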
template <typename T>
struct SharedMemory;
template <>
struct SharedMemory<float> {
__device__ float* getPointer() {
extern __shared__ float s_float[];
return s_float;
}
};
template <>
struct SharedMemory<double> {
__device__ double* getPointer() {
extern __shared__ double s_double[];
return s_double;
}
};
} // namespace
template <typename T, typename U, typename V, bool simplified>
__global__ void cuApplyLayerNorm(
V* __restrict__ output_vals,
U* __restrict__ mean,
U* __restrict__ inv_std_dev,
const T* __restrict__ vals,
const int n1,
const int n2,
const U epsilon,
const V* __restrict__ gamma,
const V* __restrict__ beta) {
// Assumptions:
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensors are contiguous
//
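  // Each block row (blockIdx.y) handles one or more rows of length n2: a
  // Welford pass computes the row's mean and variance, every thread in the
  // block then applies the (optionally simplified) affine normalization
  // element-wise, and thread (0, 0) writes out the saved statistics.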
for (int i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
SharedMemory<U> shared;
U* buf = shared.getPointer();
U mu, sigma2;
cuWelfordMuSigma2<T, U, simplified>(vals, n1, n2, i1, mu, sigma2, buf);
const T* lvals = vals + i1 * n2;
V* ovals = output_vals + i1 * n2;
U c_inv_std_dev = rsqrt(sigma2 + epsilon);
const int numx = blockDim.x * blockDim.y;
const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
for (int i = thrx; i < n2; i += numx) {
U curr = static_cast<U>(lvals[i]);
      V gamma_i = (gamma != nullptr) ? gamma[i] : (V)1;
      V beta_i = (beta != nullptr) ? beta[i] : (V)0;
if (simplified) {
ovals[i] = gamma_i * static_cast<V>(c_inv_std_dev * curr);
} else {
ovals[i] = gamma_i * static_cast<V>(c_inv_std_dev * (curr - mu)) + beta_i;
}
}
if (threadIdx.x == 0 && threadIdx.y == 0) {
if (mean != nullptr) mean[i1] = mu;
if (inv_std_dev != nullptr) inv_std_dev[i1] = c_inv_std_dev;
}
}
}
template <typename T, typename U, typename V, bool simplified>
void HostApplyLayerNorm(
const hipDeviceProp_t& prop,
hipStream_t stream,
V* output,
U* mean,
U* inv_std_dev,
const T* input,
int n1,
int n2,
double epsilon,
const V* gamma,
const V* beta) {
const int maxGridY = prop.maxGridSize[1];
const int warp_size = prop.warpSize;
ORT_ENFORCE(warp_size == GPU_WARP_SIZE_HOST);
dim3 threads(warp_size, 4, 1);
#ifdef __HIP_PLATFORM_HCC__
// Optimization for ROCm MI100
threads.y = 1;
#endif
const dim3 blocks(1, std::min<unsigned int>(n1, maxGridY), 1);
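  // Dynamic shared memory is only needed for the inter-warp reduction buffers;
  // with a single warp per block (threads.y == 1) the warp-shuffle path uses none.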
int nshared =
threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
hipLaunchKernelGGL(HIP_KERNEL_NAME(cuApplyLayerNorm<T, U, V, simplified>), blocks, threads, nshared, stream,
output,
mean,
inv_std_dev,
input,
n1, n2,
U(epsilon),
gamma, beta);
}
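// Illustrative usage (hypothetical variable names, not part of this file):
//   HostApplyLayerNorm<half, float, half, /*simplified*/ false>(
//       prop, stream, y, mean, inv_std_dev, x, batch_size, hidden_size,
//       1e-5, gamma, beta);
// normalizes each of the batch_size rows of length hidden_size and writes the
// per-row mean and inverse standard deviation when those pointers are non-null.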
#define LAYERNORM_LINEAR_IMPL(T, U, V, simplified) \
template void HostApplyLayerNorm<T, U, V, simplified>(const hipDeviceProp_t& prop, hipStream_t stream, V* output, \
U* mean, U* inv_std_dev, const T* input, int n1, int n2, \
double epsilon, const V* gamma, const V* beta);
LAYERNORM_LINEAR_IMPL(float, float, float, true)
LAYERNORM_LINEAR_IMPL(half, float, half, true)
LAYERNORM_LINEAR_IMPL(double, double, double, true)
LAYERNORM_LINEAR_IMPL(float, float, half, true)
LAYERNORM_LINEAR_IMPL(half, float, float, true)
LAYERNORM_LINEAR_IMPL(float, float, float, false)
LAYERNORM_LINEAR_IMPL(half, float, half, false)
LAYERNORM_LINEAR_IMPL(double, double, double, false)
LAYERNORM_LINEAR_IMPL(double, float, double, false)
LAYERNORM_LINEAR_IMPL(float, float, half, false)
LAYERNORM_LINEAR_IMPL(half, float, float, false)
LAYERNORM_LINEAR_IMPL(BFloat16, float, BFloat16, true)
LAYERNORM_LINEAR_IMPL(BFloat16, float, BFloat16, false)
} // namespace rocm
} // namespace onnxruntime
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename U, typename V, bool simplified>
void HostApplyLayerNorm(
const hipDeviceProp_t& prop,
hipStream_t stream,
V* output,
U* mean,
U* invvar,
const T* input,
int n1,
int n2,
double epsilon,
const V* gamma,
const V* beta);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "lrn.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_VERSIONED_TYPED(START_VER, END_VER, T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
LRN, \
kOnnxDomain, \
START_VER, \
END_VER, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
LRN<T>);
#define REGISTER_KERNEL_TYPED(VER, T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
LRN, \
kOnnxDomain, \
VER, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
LRN<T>);
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, float)
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, double)
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, MLFloat16)
REGISTER_KERNEL_TYPED(13, float)
REGISTER_KERNEL_TYPED(13, double)
REGISTER_KERNEL_TYPED(13, MLFloat16)
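// Validate the ONNX LRN attributes (size must be a positive odd window,
// alpha and beta must be positive, bias defaults to 1.0) and configure the
// MIOpen LRN descriptor once at kernel construction time.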
template <typename T>
LRN<T>::LRN(const OpKernelInfo& info) : RocmKernel(info) {
int64_t size;
ORT_ENFORCE(info.GetAttr<int64_t>("size", &size).IsOK());
ORT_ENFORCE(size > 0);
ORT_ENFORCE(size % 2 == 1);
float alpha;
float beta;
ORT_ENFORCE(info.GetAttr<float>("alpha", &alpha).IsOK());
ORT_ENFORCE(alpha > 0.0f);
ORT_ENFORCE(info.GetAttr<float>("beta", &beta).IsOK());
ORT_ENFORCE(beta > 0.0f);
float bias = info.GetAttrOrDefault<float>("bias", 1.0f);
ORT_ENFORCE(norm_desc_.Set(
gsl::narrow_cast<uint32_t>(size),
static_cast<double>(alpha),
static_cast<double>(beta),
static_cast<double>(bias))
.IsOK());
}
template <typename T>
Status LRN<T>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = context->Input<Tensor>(0);
auto rank = X->Shape().NumDimensions();
if (rank != 4 && rank != 5)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "miopen LRN only supports 4D or 5D input");
Tensor* Y = context->Output(0, X->Shape());
MiopenTensor x_tensor;
ORT_RETURN_IF_ERROR(x_tensor.Set(X->Shape().GetDims(), MiopenTensor::GetDataType<HipT>()));
const auto one = Consts<HipT>::One;
const auto zero = Consts<HipT>::Zero;
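  // MIOpen cross-channel LRN forward; the one/zero blend factors write the
  // result directly into Y without accumulating into existing output data.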
MIOPEN_RETURN_IF_ERROR(LRNCrossChannelForwardHelper(
MiopenHandle(),
norm_desc_,
miopenLRNCrossChannel,
&one,
x_tensor,
reinterpret_cast<const HipT*>(X->Data<T>()),
&zero,
x_tensor,
reinterpret_cast<HipT*>(Y->MutableData<T>())));
return Status::OK();
}
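// RAII wrapper around miopenLRNDescriptor_t: the descriptor is created lazily
// in Set() and released in the destructor.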
MiopenLRNDescriptor::MiopenLRNDescriptor() : desc_(nullptr) {
}
MiopenLRNDescriptor::~MiopenLRNDescriptor() {
if (desc_) {
miopenDestroyLRNDescriptor(desc_);
desc_ = nullptr;
}
}
Status MiopenLRNDescriptor::Set(uint32_t N, double alpha, double beta, double K) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreateLRNDescriptor(&desc_));
MIOPEN_RETURN_IF_ERROR(SetLRNDescriptorHelper(desc_, N, alpha, beta, K));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
class MiopenLRNDescriptor final {
public:
MiopenLRNDescriptor();
~MiopenLRNDescriptor();
Status Set(uint32_t N, double alpha, double beta, double K);
operator miopenLRNDescriptor_t() const { return desc_; }
private:
miopenLRNDescriptor_t desc_;
};
template <typename T>
class LRN : public RocmKernel {
public:
LRN(const OpKernelInfo& info);
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
private:
MiopenLRNDescriptor norm_desc_;
};
} // namespace rocm
} // namespace onnxruntime