"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "0d5aa0f1a06b70276a5d351280623fb2535f4f3c"
Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "max_pool_with_index.h"
#include <cfloat>
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
__global__ void MaxPoolWithIndexKernel(
int64_t batch,
int64_t channels,
int64_t height,
int64_t width,
int64_t depth,
int64_t pooled_height,
int64_t pooled_width,
int64_t pooled_depth,
int64_t kernel_h,
int64_t kernel_w,
int64_t kernel_d,
int64_t stride_h,
int64_t stride_w,
int64_t stride_d,
int64_t pad_h,
int64_t pad_w,
int64_t pad_d,
int64_t dilation_h,
int64_t dilation_w,
int64_t dilation_d,
fast_divmod fdm_c,
fast_divmod fdm_h,
fast_divmod fdm_w,
fast_divmod fdm_d,
int64_t storage_order,
const T* p_input,
int64_t output_size,
T* p_output,
int64_t* p_indices) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= output_size) return;
int d_index, w_index, h_index, c_index, n_index, id_tmp;
fdm_d.divmod(id, id_tmp, d_index);
fdm_w.divmod(id_tmp, id_tmp, w_index);
fdm_h.divmod(id_tmp, id_tmp, h_index);
fdm_c.divmod(id_tmp, n_index, c_index);
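// The flat output index is decomposed innermost-first: fdm_d peels off the pooled
// depth coordinate, fdm_w the width, fdm_h the height, and fdm_c splits the rest
// into batch (n_index) and channel (c_index).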
int64_t d_start = d_index * stride_d - pad_d;
int64_t w_start = w_index * stride_w - pad_w;
int64_t h_start = h_index * stride_h - pad_h;
int64_t d_end = _Min<int64_t>(d_start + (kernel_d - 1) * dilation_d + 1, depth);
int64_t w_end = _Min<int64_t>(w_start + (kernel_w - 1) * dilation_w + 1, width);
int64_t h_end = _Min<int64_t>(h_start + (kernel_h - 1) * dilation_h + 1, height);
d_start = _Max<int64_t>(d_start, 0);
w_start = _Max<int64_t>(w_start, 0);
h_start = _Max<int64_t>(h_start, 0);
int64_t d_index_max = -1;
int64_t w_index_max = -1;
int64_t h_index_max = -1;
int64_t offset = (n_index * channels + c_index) * height * width * depth;
const T* p_slice = p_input + offset;
T maxval = p_slice[h_start * width * depth + w_start * depth + d_start] - (T)1;
for (int64_t d = d_start; d < d_end; d += dilation_d) {
for (int64_t w = w_start; w < w_end; w += dilation_w) {
for (int64_t h = h_start; h < h_end; h += dilation_h) {
if (p_slice[h * width * depth + w * depth + d] > maxval) {
h_index_max = h;
w_index_max = w;
d_index_max = d;
maxval = static_cast<float>(p_slice[h * width * depth + w * depth + d]);
}
}
}
}
p_output[id] = p_input[offset + h_index_max * width * depth + w_index_max * depth + d_index_max];
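// storage_order follows the ONNX MaxPool attribute: 0 returns a row-major
// flattening of the selected (h, w, d) position, 1 a column-major one.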
if (p_indices) {
p_indices[id] = storage_order == 0 ? offset + h_index_max * width * depth + w_index_max * depth + d_index_max
: offset + h_index_max + w_index_max * height + d_index_max * width * height;
}
}
template <typename T>
void MaxPoolWithIndex(
hipStream_t stream,
const TensorShape& input_shape,
const TensorShape& output_shape,
const gsl::span<const int64_t>& kernel_shape,
const gsl::span<const int64_t>& stride_shape,
const gsl::span<const int64_t>& pads,
const gsl::span<const int64_t>& dilations,
int64_t storage_order,
const T* p_input,
T* p_output,
int64_t* p_indices) {
int64_t batchs = input_shape[0];
int64_t channels = input_shape[1];
int64_t height = input_shape[2];
int64_t width = kernel_shape.size() > 1 ? input_shape[3] : 1;
int64_t depth = kernel_shape.size() > 2 ? input_shape[4] : 1;
int64_t pooled_height = output_shape[2];
int64_t pooled_width = kernel_shape.size() > 1 ? output_shape[3] : 1;
int64_t pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1;
int64_t kernel_h = kernel_shape[0];
int64_t kernel_w = kernel_shape.size() > 1 ? kernel_shape[1] : 1;
int64_t kernel_d = kernel_shape.size() > 2 ? kernel_shape[2] : 1;
int64_t stride_h = stride_shape[0];
int64_t stride_w = stride_shape.size() > 1 ? stride_shape[1] : 1;
int64_t stride_d = stride_shape.size() > 2 ? stride_shape[2] : 1;
// pads is in the format [x1_begin, x2_begin, ..., x1_end, x2_end, ...],
// where xi_begin is the number of pixels added at the beginning of axis i
// and xi_end is the number of pixels added at the end of axis i.
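// Example: with 3-D pads = {1, 2, 3, 1, 2, 3}, pad_h = 1, pad_w = 2, pad_d = 3;
// only the begin pads are read below, since output_shape already accounts for the end pads.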
int64_t pad_h = pads[0];
int64_t pad_w = pads.size() >= 4 ? pads[1] : 0;
int64_t pad_d = pads.size() == 6 ? pads[2] : 0;
int64_t dilation_h = dilations[0];
int64_t dilation_w = dilations.size() >= 2 ? dilations[1] : 1;
int64_t dilation_d = dilations.size() == 3 ? dilations[2] : 1;
int64_t output_size = output_shape.Size();
fast_divmod fdm_c(static_cast<int>(channels));
fast_divmod fdm_h(static_cast<int>(pooled_height));
fast_divmod fdm_w(static_cast<int>(pooled_width));
fast_divmod fdm_d(static_cast<int>(pooled_depth));
int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
hipLaunchKernelGGL(MaxPoolWithIndexKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
batchs,
channels,
height,
width,
depth,
pooled_height,
pooled_width,
pooled_depth,
kernel_h,
kernel_w,
kernel_d,
stride_h,
stride_w,
stride_d,
pad_h,
pad_w,
pad_d,
dilation_h,
dilation_w,
dilation_d,
fdm_c,
fdm_h,
fdm_w,
fdm_d,
storage_order,
p_input,
output_size,
p_output,
p_indices);
}
#define INSTANTIATEMAXPOOLWITHINDEX(T) \
template void MaxPoolWithIndex<T>( \
hipStream_t stream, \
const TensorShape& input_shape, \
const TensorShape& output_shape, \
const gsl::span<const int64_t>& kernel_shape, \
const gsl::span<const int64_t>& stride_shape, \
const gsl::span<const int64_t>& pads, \
const gsl::span<const int64_t>& dilations, \
int64_t storage_order, \
const T* p_input, \
T* p_output, \
int64_t* p_indices);
INSTANTIATEMAXPOOLWITHINDEX(float)
INSTANTIATEMAXPOOLWITHINDEX(double)
INSTANTIATEMAXPOOLWITHINDEX(half)
INSTANTIATEMAXPOOLWITHINDEX(int8_t)
INSTANTIATEMAXPOOLWITHINDEX(uint8_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <vector>
#include "core/framework/tensor_shape.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void MaxPoolWithIndex(
hipStream_t stream,
const TensorShape& input_shape,
const TensorShape& output_shape,
const gsl::span<const int64_t>& kernel_shape,
const gsl::span<const int64_t>& stride_shape,
const gsl::span<const int64_t>& pads,
const gsl::span<const int64_t>& dilations,
int64_t storage_order,
const T* p_input,
T* p_output,
int64_t* p_indices);
} //namespace rocm
} //namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/pool.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/rocm/nn/max_pool_with_index.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
end_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
end_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10)
// AveragePool and MaxPool opset 11 only updated the spec documentation on the default values for dilations and strides.
POOLING_KERNEL(AveragePool, float, AveragePool, 11)
POOLING_KERNEL(AveragePool, double, AveragePool, 11)
POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11)
POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1)
POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11)
POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12)
POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1)
class MiopenPoolingDescriptor final {
public:
MiopenPoolingDescriptor() : desc_(nullptr) {
}
~MiopenPoolingDescriptor() {
if (desc_ != nullptr) {
miopenDestroyPoolingDescriptor(desc_);
desc_ = nullptr;
}
}
MiopenPoolingDescriptor(const MiopenPoolingDescriptor&) = delete;
MiopenPoolingDescriptor& operator=(const MiopenPoolingDescriptor&) = delete;
Status Set(miopenPoolingMode_t mode,
const gsl::span<const int64_t>& kernel_shape,
const gsl::span<const int64_t>& pads,
const gsl::span<const int64_t>& strides) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreatePoolingDescriptor(&desc_));
int rank = gsl::narrow_cast<int>(kernel_shape.size());
InlinedVector<int> window(rank);
InlinedVector<int> padding(rank);
InlinedVector<int> stride(rank);
for (int i = 0; i < rank; i++) {
window[i] = gsl::narrow_cast<int>(kernel_shape[i]);
}
for (int i = 0; i < rank; i++) {
padding[i] = gsl::narrow_cast<int>(pads[i]);
}
for (int i = 0; i < rank; i++) {
stride[i] = gsl::narrow_cast<int>(strides[i]);
}
MIOPEN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper(
desc_,
mode,
MIOPEN_PROPAGATE_NAN,
rank,
window.data(),
padding.data(),
stride.data()));
return Status::OK();
}
operator miopenPoolingDescriptor_t() const { return desc_; }
private:
miopenPoolingDescriptor_t desc_;
};
template <typename T, typename PoolType>
Status Pool<T, PoolType>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& x_shape = X->Shape();
const auto x_dims = x_shape.GetDims();
if (x_shape.NumDimensions() < 3) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3.");
}
auto kernel_shape = pool_attrs_.kernel_shape;
auto pads = pool_attrs_.pads;
auto strides = pool_attrs_.strides;
if (pool_attrs_.global_pooling) {
kernel_shape.assign(x_dims.begin() + 2, x_dims.end());
pads.assign(kernel_shape.size(), 0);
strides.assign(kernel_shape.size(), 1);
}
auto y_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
TensorShape y_shape(y_dims);
Tensor* Y = context->Output(0, y_shape);
// special case when there is a dim value of 0 in the shape.
if (y_shape.Size() == 0)
return Status::OK();
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
TensorShapeVector x_dims_miopen(x_dims.begin(), x_dims.end());
TensorShapeVector y_dims_miopen(y_dims);
if (kernel_shape.size() < 2) {
// miopen only takes 4D or 5D input, so pad dimensions if needed
x_dims_miopen.push_back(1);
y_dims_miopen.push_back(1);
pads.insert(pads.begin() + kernel_shape.size(), 0);
pads.insert(pads.end(), 0);
kernel_shape.push_back(1);
strides.push_back(1);
}
miopenPoolingMode_t mode = miopenPoolingMax;
if constexpr (PoolType::type == onnxruntime::PoolType::kAveragePool) {
mode = pool_attrs_.count_include_pad ? miopenPoolingAverageInclusive
: miopenPoolingAverage;
}
MiopenPoolingDescriptor pooling_desc;
ORT_RETURN_IF_ERROR(pooling_desc.Set(mode, kernel_shape, pads, strides));
if constexpr (std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value) {
// Cast to float back and forth using temp buffer
const auto alpha = Consts<float>::One;
const auto beta = Consts<float>::Zero;
MiopenTensor x_tensor;
MiopenTensor y_tensor;
ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_miopen, MiopenTensor::GetDataType<float>()));
ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_miopen, MiopenTensor::GetDataType<float>()));
const auto input_count = x_shape.Size();
const auto output_count = y_shape.Size();
IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
auto temp_Y = GetScratchBuffer<float>(output_count);
Impl_Cast<HipT, float>(Stream(), reinterpret_cast<const HipT*>(x_data), temp_X.get(), input_count);
MIOPEN_RETURN_IF_ERROR(PoolingForwardHelper(MiopenHandle(), pooling_desc, &alpha,
x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get()));
Impl_Cast<float, HipT>(Stream(), temp_Y.get(), y_data, output_count);
} else {
const auto alpha = Consts<HipT>::One;
const auto beta = Consts<HipT>::Zero;
MiopenTensor x_tensor;
MiopenTensor y_tensor;
ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_miopen, MiopenTensor::GetDataType<HipT>()));
ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_miopen, MiopenTensor::GetDataType<HipT>()));
MIOPEN_RETURN_IF_ERROR(PoolingForwardHelper(MiopenHandle(), pooling_desc, &alpha,
x_tensor, x_data, &beta, y_tensor, y_data));
}
return Status::OK();
}
template <typename T>
Status Pool<T, MaxPool<8>>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& x_shape = X->Shape();
const auto& x_dims = x_shape.GetDims();
if (x_shape.NumDimensions() < 3) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3.");
}
auto kernel_shape = this->pool_attrs_.kernel_shape;
auto pads = this->pool_attrs_.pads;
auto strides = this->pool_attrs_.strides;
if (this->pool_attrs_.global_pooling) {
kernel_shape.assign(x_dims.begin() + 2, x_dims.end());
pads.assign(kernel_shape.size(), 0);
strides.assign(kernel_shape.size(), 1);
}
auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
Tensor* Y = context->Output(0, TensorShape(y_dims));
// special case when there is a dim value of 0 in the shape.
if (Y->Shape().Size() == 0)
return Status::OK();
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
Tensor* I = context->Output(1, TensorShape(y_dims));
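// Use the custom MaxPoolWithIndex kernel when the optional Indices output is requested
// or when dilations are non-default; otherwise fall back to the MIOpen pooling path.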
if (nullptr != I || !this->pool_attrs_.default_dilations) {
auto i_data = nullptr == I ? nullptr : I->MutableData<int64_t>();
MaxPoolWithIndex<HipT>(
this->Stream(),
x_shape,
TensorShape(y_dims),
kernel_shape,
strides,
pads,
this->pool_attrs_.dilations,
this->pool_attrs_.storage_order,
x_data,
y_data,
i_data);
} else {
ORT_RETURN_IF_ERROR((Pool<T, MaxPool<1>>::ComputeInternal(context)));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/cpu/nn/pool_base.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename PoolType>
class Pool : public RocmKernel, public PoolBase {
public:
Pool(const OpKernelInfo& info) : RocmKernel(info), PoolBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
template <typename T>
class Pool<T, MaxPool<8>> final : public Pool<T, MaxPool<1>> {
public:
Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>>(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "shrink.h"
#include "shrink_impl.h"
#include "core/providers/common.h"
using namespace std;
namespace onnxruntime {
namespace rocm {
#define SHRINK_REGISTER_KERNEL(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Shrink, \
kOnnxDomain, \
9, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.MayInplace(0, 0) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Shrink<T>);
template <typename T>
Status Shrink<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const auto* x_data = reinterpret_cast<const HipT*>(X->Data<T>());
const TensorShape& x_shape = X->Shape();
const size_t x_size = static_cast<size_t>(x_shape.Size());
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
ShrinkImpl<HipT>(Stream(), x_data, bias_, lambd_, y_data, x_size);
return Status::OK();
}
SHRINK_REGISTER_KERNEL(float)
SHRINK_REGISTER_KERNEL(double)
SHRINK_REGISTER_KERNEL(MLFloat16)
SHRINK_REGISTER_KERNEL(uint8_t)
SHRINK_REGISTER_KERNEL(int8_t)
SHRINK_REGISTER_KERNEL(uint16_t)
SHRINK_REGISTER_KERNEL(int16_t)
SHRINK_REGISTER_KERNEL(uint32_t)
SHRINK_REGISTER_KERNEL(int32_t)
SHRINK_REGISTER_KERNEL(uint64_t)
SHRINK_REGISTER_KERNEL(int64_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Shrink final : public RocmKernel {
public:
Shrink(const OpKernelInfo& info) : RocmKernel(info) {
float bias_temp;
// if the attribute exists, use the value
if (info.GetAttr<float>("bias", &bias_temp).IsOK())
bias_ = bias_temp;
float lambd_temp;
// if the attribute exists, use the value
if (info.GetAttr<float>("lambd", &lambd_temp).IsOK())
lambd_ = lambd_temp;
}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const;
private:
float bias_ = 0.0f; // default as per spec
float lambd_ = 0.5f; // default as per spec
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "shrink_impl.h"
namespace onnxruntime {
namespace rocm {
// Generic implementation of Shrink
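// y = x + bias if x < -lambd; y = x - bias if x > lambd; y = 0 otherwise.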
template <typename T>
__global__ void _ShrinkKernel(
const T* input_data,
const float bias,
const float lambda,
T* output_data,
const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
T x = input_data[id];
if (x < -lambda) {
output_data[id] = (T)(x + bias);
} else if (x > lambda) {
output_data[id] = (T)(x - bias);
} else {
output_data[id] = (T)0;
}
}
// Specialized implementation for the 'half' type:
// convert the 'half' data to 'float' first,
// do the operation, and convert the result back to 'half'.
template <>
__global__ void _ShrinkKernel(
const half* input_data,
const float bias,
const float lambda,
half* output_data,
const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
half x = input_data[id];
if ((float)x < -lambda) {
output_data[id] = half((float)x + bias);
} else if ((float)x > lambda) {
output_data[id] = half((float)x - bias);
} else {
output_data[id] = (half)0;
}
}
template <typename T>
void ShrinkImpl(
hipStream_t stream,
const T* input_data,
const float bias,
const float lambda,
T* output_data,
size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ShrinkKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_data, bias, lambda, output_data, (HIP_LONG)N);
}
#define SPECIALIZED_IMPL(T) \
template void ShrinkImpl<T>(hipStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(uint8_t)
SPECIALIZED_IMPL(int8_t)
SPECIALIZED_IMPL(uint16_t)
SPECIALIZED_IMPL(int16_t)
SPECIALIZED_IMPL(uint32_t)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint64_t)
SPECIALIZED_IMPL(int64_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
namespace rocm {
template <typename T>
void ShrinkImpl(
hipStream_t stream,
const T* input_data,
const float bias,
const float lambda,
T* output_data,
size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef ENABLE_NVTX_PROFILE
#include "nvtx_profile.h"
#include "core/common/common.h"
#include <nvToolsExt.h>
#include <nvToolsExtCuda.h>
namespace onnxruntime {
namespace profile {
void NvtxRangeCreator::BeginImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color_);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = message_.c_str();
range_id_ = nvtxRangeStartEx(&eventAttrib);
}
void NvtxRangeCreator::EndImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxRangeEnd(range_id_);
}
void NvtxNestedRangeCreator::BeginImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color_);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = message_.c_str();
nvtxRangePushEx(&eventAttrib);
}
void NvtxNestedRangeCreator::EndImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxRangePop();
}
void NvtxMarkerCreator::Mark() {
// enable only for debug builds because this function is for profiling only.
nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color_);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = message_.c_str();
nvtxMarkEx(&eventAttrib);
}
} // namespace profile
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// The enclosed classes are wrappers over NVIDIA's visual profiler (NVTX) APIs.
// They can be used to plot the time intervals of forward and backward passes.
// They can also be used to plot the time span of a specific operator.
// At the time of writing, NVIDIA only supports this tool on Linux.
#ifdef ENABLE_NVTX_PROFILE
#pragma once
#include <cinttypes>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include "core/common/common.h"
namespace onnxruntime {
namespace profile {
// Color in ARGB space.
// A: first 8 bits.
// R: next 8 bits.
// G: next 8 bits.
// B: last 8 bits.
// All color channels have range [0, 255].
enum class Color : uint32_t {
Black = 0x00000000,
Red = 0x00ff0000,
DarkGreen = 0x00009900,
Green = 0x0000ff00,
LightGreen = 0x00ccffcc,
Blue = 0x000000ff,
Amber = 0x00ffbf00,
LightAmber = 0x00fff2cc,
White = 0x00ffffff,
Cyan = 0x0000ffff,
Magenta = 0x00ff00ff,
Yellow = 0x00ffff00,
};
class RangeCreatorBase {
public:
RangeCreatorBase(const std::string message, const Color color)
: message_(message), color_(color),
is_begin_called_(false), is_end_called_(false) {};
// Check that Begin and End were both called.
// A range is pointless if either of them is missing.
~RangeCreatorBase() {
if (!is_begin_called_) {
std::cerr << "Begin must be called once." << std::endl;
}
if (!is_end_called_) {
std::cerr << "End must be called once." << std::endl;
}
}
// Mark the beginning of a range.
void Begin() {
ORT_ENFORCE(!is_begin_called_, "Begin cannot be called more than once.");
ORT_ENFORCE(!is_end_called_, "Begin cannot be called after calling End.");
BeginImpl();
is_begin_called_ = true;
}
// Mark the end of a range.
void End() {
ORT_ENFORCE(is_begin_called_, "End must be called after calling Begin.");
ORT_ENFORCE(!is_end_called_, "End cannot be called more than once.");
EndImpl();
is_end_called_ = true;
}
bool IsBeginCalled() const {
return is_begin_called_;
}
bool IsEndCalled() const {
return is_end_called_;
}
virtual void BeginImpl() = 0;
virtual void EndImpl() = 0;
protected:
// Text on this event.
const std::string message_;
// Color of event in ARGB space.
const Color color_;
bool is_begin_called_;
bool is_end_called_;
};
class NvtxRangeCreator final : public RangeCreatorBase {
public:
NvtxRangeCreator(const std::string message, const Color color)
: RangeCreatorBase(message, color) {};
void BeginImpl() override;
void EndImpl() override;
private:
// It records the event ID created by BeginImpl.
// EndImpl needs this value to end the right event.
uint64_t range_id_;
};
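// Minimal usage sketch, assuming ENABLE_NVTX_PROFILE is defined and this header is included:
//
//   profile::NvtxRangeCreator range("forward pass", profile::Color::Green);
//   range.Begin();
//   // ... work to be profiled ...
//   range.End();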
class NvtxNestedRangeCreator final : public RangeCreatorBase {
public:
NvtxNestedRangeCreator(const std::string message, const Color color)
: RangeCreatorBase(message, color) {};
void BeginImpl() override;
void EndImpl() override;
};
class NvtxMarkerCreator final {
public:
NvtxMarkerCreator(const std::string message, const Color color)
: message_(message), color_(color) {};
void Mark();
private:
// Text on this marker.
const std::string message_;
// See nvtxRangeCreator.color_.
const Color color_;
};
} // namespace profile
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <thread>
#include <string>
#include <unordered_map>
#include "core/platform/ort_mutex.h"
#ifdef ENABLE_NVTX_PROFILE
namespace onnxruntime {
namespace profile {
// Singleton class managing global NVTX profiling information.
class Context {
public:
static Context& GetInstance() {
static Context instance_;
return instance_;
}
// Return tag for the specified thread.
// If the thread's tag doesn't exist, this function returns an empty string.
std::string GetThreadTagOrDefault(const std::thread::id& thread_id) {
const std::lock_guard<OrtMutex> lock(mtx_);
return thread_tag_[thread_id];
}
// Set tag for the specified thread.
void SetThreadTag(
const std::thread::id& thread_id, const std::string& tag) {
const std::lock_guard<OrtMutex> lock(mtx_);
thread_tag_[thread_id] = tag;
}
private:
Context() = default;
~Context() = default;
Context(const Context&) = delete;
Context& operator=(const Context&) = delete;
// map from thread's id to its human-readable tag.
std::unordered_map<std::thread::id, std::string> thread_tag_;
OrtMutex mtx_;
};
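// Usage sketch, assuming this header is included: tag the current thread so that
// NVTX ranges created on it can be grouped under a human-readable name
// ("worker-0" below is just an illustrative tag):
//
//   profile::Context::GetInstance().SetThreadTag(std::this_thread::get_id(), "worker-0");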
} // namespace profile
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "non_max_suppression.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "non_max_suppression_impl.h"
#include "core/providers/rocm/tensor/concat_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
NonMaxSuppression,
kOnnxDomain,
10, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 2)
.InputMemoryType(OrtMemTypeCPUInput, 3)
.InputMemoryType(OrtMemTypeCPUInput, 4),
NonMaxSuppression);
ONNX_OPERATOR_KERNEL_EX(
NonMaxSuppression,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 2)
.InputMemoryType(OrtMemTypeCPUInput, 3)
.InputMemoryType(OrtMemTypeCPUInput, 4),
NonMaxSuppression);
Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const {
PrepareContext pc;
ORT_RETURN_IF_ERROR(PrepareCompute(ctx, pc));
int64_t max_output_boxes_per_class = 0;
float iou_threshold = .0f;
float score_threshold = .0f;
ORT_RETURN_IF_ERROR(GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold));
if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) {
ctx->Output(0, {0, 3});
return Status::OK();
}
// TODO: use hipcub::DeviceSegmentedRadixSort::SortPairsDescending instead of hipcub::DeviceRadixSort::SortPairsDescending
// to handle multiple batches/classes in parallel
std::vector<std::tuple<IAllocatorUniquePtr<void>, int>> all_selected_indices;
int total_num_saved_outputs = 0;
// safe downcast max_output_boxes_per_class to int as hipcub::DeviceSelect::Flagged() does not support int64_t
int int_max_output_boxes_per_class = max_output_boxes_per_class > std::numeric_limits<int>::max()
? std::numeric_limits<int>::max()
: static_cast<int>(max_output_boxes_per_class);
for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) {
for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) {
IAllocatorUniquePtr<void> d_selected_indices{};
IAllocatorUniquePtr<void> h_number_selected_ptr{AllocateBufferOnCPUPinned<void>(sizeof(int))};
auto* h_number_selected = static_cast<int*>(h_number_selected_ptr.get());
ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl(
Stream(),
[this](size_t bytes) { return GetScratchBuffer<void>(bytes); },
pc,
GetCenterPointBox(),
batch_index,
class_index,
int_max_output_boxes_per_class,
iou_threshold,
score_threshold,
d_selected_indices,
h_number_selected));
int num_saved_outputs = *h_number_selected;
if (num_saved_outputs > 0) {
all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs);
total_num_saved_outputs += num_saved_outputs;
}
}
}
if (total_num_saved_outputs == 0) {
ctx->Output(0, {0, 3});
} else {
// concatenate outputs
constexpr int last_dim = 3;
const int num_elements = last_dim * total_num_saved_outputs;
Tensor* output = ctx->Output(0, {static_cast<int64_t>(total_num_saved_outputs), last_dim});
ORT_ENFORCE(output != nullptr);
int64_t* dst = output->MutableData<int64_t>();
size_t count = all_selected_indices.size();
RocmAsyncBuffer<const void*> input_ptr(this, count);
RocmAsyncBuffer<int64_t> concat_sizes_gpu(this, count);
RocmAsyncBuffer<int64_t> concat_sizes_range_gpu(this, count);
RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs);
int index = 0;
for (size_t i = 0; i < count; i++) {
auto& it = all_selected_indices[i];
auto src = std::get<0>(it).get();
auto size = std::get<1>(it);
input_ptr.CpuPtr()[i] = src;
concat_sizes_gpu.CpuPtr()[i] = size;
concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1];
for (int j = 0; j < size; j++) {
axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i;
}
}
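// At this point input_ptr and concat_sizes_gpu describe each (batch, class) segment,
// concat_sizes_range_gpu holds the inclusive prefix sums of the selected counts, and
// axis_dimension_input_output_mapping_gpu maps every output row to its source segment.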
ORT_RETURN_IF_ERROR(concat_sizes_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(concat_sizes_range_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(ConcatImpl(Stream(),
sizeof(int64_t),
num_elements,
last_dim,
concat_sizes_gpu.GpuPtr(),
concat_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(),
dst,
input_ptr.GpuPtr(),
static_cast<size_t>(num_elements)));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/object_detection/non_max_suppression.h"
namespace onnxruntime {
namespace rocm {
struct NonMaxSuppression final : public RocmKernel, public NonMaxSuppressionBase {
explicit NonMaxSuppression(const OpKernelInfo& info) : RocmKernel(info), NonMaxSuppressionBase(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression);
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Modifications Copyright (c) Microsoft. */
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "non_max_suppression_impl.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
#include <hipcub/hipcub.hpp>
//TODO:fix the warnings
#ifdef _MSC_VER
#pragma warning(disable : 4244)
#endif
namespace onnxruntime {
namespace rocm {
using namespace nms_helpers;
namespace {
struct __align__(16) Box {
float x1, y1, x2, y2;
};
// This is the width of the bitmask for masking boxes for each thread.
// This needs to be a power of 2 (a POD bit-width, usually) so that division and
// modulo can be implemented as bit operations during host selection.
constexpr int kNmsBoxesPerThread = 8 * sizeof(int);
// Helper to calculate the modulo mask and shift bits.
// For kNmsBoxesPerThread=32 the modulo mask will be 31, i.e. 0x1F, so
// i % 32 == i & 31. Similarly the shift will be 5 so that
// i / 32 == i >> 5. Using these bit operations should reduce stalls on the host
// thread.
__device__ constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
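// For example, NumBits(31) == 5 and NumBits(32) == 6, so for a 32-bit mask word
// CheckBit below uses kShiftLen = 5 and kRemainderMask = 31:
// bit / 32 == bit >> 5 and bit % 32 == bit & 31.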
constexpr int kNmsBlockDim = 16;
constexpr int kNmsBlockDimMax = 128;
// Check whether two boxes have an IoU greater than threshold.
template <typename T>
__device__ inline bool OverThreshold(const Box* a, const Box* b,
const float a_area,
const T iou_threshold) {
const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
if (a_area == 0.0f || b_area == 0.0f) return false;
const float xx1 = fmaxf(a->x1, b->x1);
const float yy1 = fmaxf(a->y1, b->y1);
const float xx2 = fminf(a->x2, b->x2);
const float yy2 = fminf(a->y2, b->y2);
// fdimf computes the positive difference between xx2 and xx1.
const float w = fdimf(xx2, xx1);
const float h = fdimf(yy2, yy1);
const float intersection = w * h;
// Testing aa/bb > t is equivalent to aa > bb*t (bb != 0),
// avoiding a division.
const float aa = intersection;
const float bb = a_area + b_area - intersection;
const float bt = bb * iou_threshold;
return aa >= bt;
}
template <typename T>
__device__ inline bool CheckBit(T* bit_mask, int bit) {
constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1;
constexpr int kRemainderMask = 8 * sizeof(T) - 1;
int bin = bit >> kShiftLen;
return (bit_mask[bin] >> (bit & kRemainderMask)) & 1;
}
// Produce a global bitmask (result_mask) of selected boxes from the bitmask
// generated by NMSKernel. Abort early if max_boxes boxes are selected. The bitmask
// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box.
__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
const int num_boxes, const int max_boxes,
char* result_mask) {
extern __shared__ int local[];
// set global mask to accept all boxes
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < bit_mask_len; box += blockDim.x * gridDim.x) {
local[box] = 0xFFFFFFFF;
}
__syncthreads();
int accepted_boxes = 0;
for (int box = 0; box < num_boxes - 1; ++box) {
// if current box is masked by an earlier box, skip it.
if (!CheckBit(local, box)) {
continue;
}
accepted_boxes += 1;
int offset = box * bit_mask_len;
// update global mask with current box's mask
for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < bit_mask_len; b += blockDim.x * gridDim.x) {
local[b] &= ~bitmask[offset + b];
}
__syncthreads();
if (accepted_boxes > max_boxes) break;
}
// copy the global mask to the result_mask char array. A char array is needed for
// hipcub::DeviceSelect later.
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < num_boxes; box += blockDim.x * gridDim.x) {
result_mask[box] = CheckBit(local, box);
}
}
// For each box, compute a bitmask of boxes which have an overlap with the given box
// above the threshold.
//
// Starting from the highest-scoring box, mark any box which has IoU>threshold with
// the given box. Each thread processes kNmsBoxesPerThread boxes per stride, and
// each box has a bitmask of overlaps of length bit_mask_len.
//
__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
void NMSKernel(
const int64_t center_point_box,
const Box* d_desc_sorted_boxes,
const int num_boxes,
const float iou_threshold,
const int bit_mask_len,
int* d_delete_mask) {
for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
i_block_offset += blockDim.x * gridDim.x) {
const int i = i_block_offset + threadIdx.x;
if (i < num_boxes) {
for (int j_thread_offset =
kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
j_thread_offset < num_boxes;
j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
// Note : We can do everything using multiplication,
// and use fp16 - we are comparing against a low precision
// threshold.
int above_threshold = 0;
// Make sure that threads are within valid domain.
bool valid = false;
// Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
// if it is overlapping with current box
for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
// This thread will compare Box i and Box j.
const int j = j_thread_offset + ib;
if (i >= j || i >= num_boxes || j >= num_boxes) continue;
valid = true;
if (SuppressByIOU(reinterpret_cast<const float*>(d_desc_sorted_boxes),
i, j, center_point_box, iou_threshold)) {
// we have score[j] <= score[i].
above_threshold |= (1U << ib);
}
}
if (valid) {
d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
above_threshold;
}
}
}
}
}
// Variadic template helpers for index-selecting from multiple arrays at the
// same time.
template <typename Index>
__device__ inline void SelectHelper(const Index /*i_selected */,
const Index /* i_original */) {}
template <typename Index, typename T, typename... Args>
__device__ inline void SelectHelper(const Index i_selected,
const Index i_original,
const T* original, T* selected,
Args... args) {
selected[i_selected] = original[i_original];
SelectHelper(i_selected, i_original, args...);
}
// Helper template to select elements from the original arrays using the index
// mapping and store them into the selected arrays. Each array sharing the same
// mapping needs to be passed as a pair of pointers to the original and selected
// arrays. For selecting 2 arrays the call would be
// IndexMultiSelect(num_elements, indices, original1, selected1, original2,
// selected2).
template <typename Index, typename T, typename... Args>
__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
const T* original, T* selected, Args... args) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
SelectHelper(idx, indices[idx], original, selected, args...);
}
}
template <typename T>
__global__ void SetZero(const int count, T* __restrict__ ptr) {
// Check that the grid is one dimensional and index doesn't overflow.
assert(blockDim.y == 1);
assert(blockDim.z == 1);
assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
ptr[i] = T(0);
}
}
template <typename T>
__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
to_fill[idx] = static_cast<T>(idx) + offset;
}
}
__global__ void NormalizeOutput(const int num_elements, const int* original, int64_t* to_normalize, int64_t batch_index, int64_t class_index) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
to_normalize[idx * 3] = batch_index;
to_normalize[idx * 3 + 1] = class_index;
to_normalize[idx * 3 + 2] = static_cast<int64_t>(original[idx]);
}
}
Status NmsGpu(hipStream_t stream,
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const int64_t center_point_box,
const float* d_sorted_boxes_float_ptr,
const int num_boxes,
const float iou_threshold,
int* d_selected_indices,
int* h_nkeep,
const int max_boxes) {
// Make sure we respect the __align__(16)
// we promised to the compiler.
auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
ORT_ENFORCE((iptr & 15) == 0);
const int bit_mask_len =
(num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;
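// e.g., num_boxes = 100 with kNmsBoxesPerThread = 32 gives bit_mask_len = 4,
// i.e. four ints (128 bits) of overlap mask per box.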
int max_nms_mask_size = num_boxes * bit_mask_len;
IAllocatorUniquePtr<void> d_nms_mask_ptr{allocator(max_nms_mask_size * sizeof(int))};
auto* d_nms_mask = static_cast<int*>(d_nms_mask_ptr.get());
int blocksPerGrid = (int)(ceil(static_cast<float>(max_nms_mask_size) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(SetZero<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, max_nms_mask_size, d_nms_mask);
int* d_delete_mask = d_nms_mask;
int* h_selected_count = h_nkeep;
const Box* d_sorted_boxes =
reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
dim3 block_dim, thread_block;
int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
block_dim.x = num_blocks;
block_dim.y = num_blocks;
block_dim.z = 1;
thread_block.x = kNmsBlockDim;
thread_block.y = kNmsBlockDim;
thread_block.z = 1;
hipLaunchKernelGGL(NMSKernel, block_dim, thread_block, 0, stream, center_point_box,
d_sorted_boxes,
num_boxes,
iou_threshold,
bit_mask_len,
d_delete_mask);
IAllocatorUniquePtr<void> d_selected_boxes_ptr{allocator(num_boxes * sizeof(char))};
auto* d_selected_boxes = static_cast<char*>(d_selected_boxes_ptr.get());
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(Iota<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_boxes, 0, d_indices);
NMSReduce<<<1, 1024, bit_mask_len * sizeof(int), stream>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes);
size_t flagged_buffer_size = 0;
HIP_RETURN_IF_ERROR(hipcub::DeviceSelect::Flagged(static_cast<void*>(nullptr), // temp_storage
flagged_buffer_size,
static_cast<int*>(nullptr), // input
static_cast<char*>(nullptr), // selection flag
static_cast<int*>(nullptr), // selected items
static_cast<int*>(nullptr), // num_selected
num_boxes,
stream));
IAllocatorUniquePtr<void> d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)};
auto* d_cub_scratch_buffer = static_cast<uint8_t*>(d_cub_scratch_buffer_ptr.get());
IAllocatorUniquePtr<void> d_num_selected_ptr{allocator(sizeof(int))};
auto* d_num_selected = static_cast<int*>(d_num_selected_ptr.get());
HIP_RETURN_IF_ERROR(hipcub::DeviceSelect::Flagged(
d_cub_scratch_buffer, // temp_storage
flagged_buffer_size,
d_indices, // input
d_selected_boxes, // selection flag
d_selected_indices, // selected items
d_num_selected, num_boxes, stream));
HIP_RETURN_IF_ERROR(hipMemcpyAsync(h_selected_count, d_num_selected, sizeof(int), hipMemcpyDeviceToHost, stream));
// hipStreamSynchronize is needed since the value of h_selected_count will be used by host after this function.
HIP_RETURN_IF_ERROR(hipStreamSynchronize(stream));
return Status::OK();
}
struct DeviceGreaterThan {
float threshold_;
__host__ __device__ __forceinline__ DeviceGreaterThan(float threshold)
: threshold_(threshold) {}
__host__ __device__ __forceinline__ bool operator()(const float& val) const {
return (val > threshold_);
}
};
} // namespace
Status NonMaxSuppressionImpl(
hipStream_t stream,
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const PrepareContext& pc,
const int64_t center_point_box,
int64_t batch_index,
int64_t class_index,
int max_output_boxes_per_class,
float iou_threshold,
float score_threshold,
IAllocatorUniquePtr<void>& selected_indices,
int* h_number_selected) {
// STEP 1. Prepare data
int num_boxes = pc.num_boxes_;
const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4;
const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes;
// prepare temporary memory for sorting the scores
// calculate the temporary storage size needed for sorting
size_t cub_sort_temp_storage_bytes = 0;
HIP_RETURN_IF_ERROR(hipcub::DeviceRadixSort::SortPairsDescending(
nullptr, cub_sort_temp_storage_bytes,
static_cast<float*>(nullptr), // scores
static_cast<float*>(nullptr), // sorted scores
static_cast<int*>(nullptr), // input indices
static_cast<int*>(nullptr), // sorted indices
num_boxes, // num items
0, 8 * sizeof(float), // sort all bits
stream));
// allocate temporary memory
IAllocatorUniquePtr<void> d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)};
auto* d_cub_sort_buffer = static_cast<uint8_t*>(d_cub_sort_buffer_ptr.get());
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
IAllocatorUniquePtr<void> d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_sorted_indices = static_cast<int*>(d_sorted_indices_ptr.get());
IAllocatorUniquePtr<void> d_selected_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_selected_indices = static_cast<int*>(d_selected_indices_ptr.get());
IAllocatorUniquePtr<void> d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))};
auto* d_sorted_scores = static_cast<float*>(d_sorted_scores_ptr.get());
IAllocatorUniquePtr<void> d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))};
auto* d_sorted_boxes = static_cast<float*>(d_sorted_boxes_ptr.get());
// create a sequence of indices
int blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(Iota<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_boxes, 0, d_indices);
HIP_RETURN_IF_ERROR(hipGetLastError());
// sort scores
HIP_RETURN_IF_ERROR(hipcub::DeviceRadixSort::SortPairsDescending(
d_cub_sort_buffer,
cub_sort_temp_storage_bytes,
scores_data,
d_sorted_scores,
d_indices,
d_sorted_indices,
num_boxes,
0,
8 * sizeof(float), // sort all bits
stream));
// gather the boxes in sorted-score order
const Box* original_boxes = reinterpret_cast<const Box*>(boxes_data);
Box* sorted_boxes = reinterpret_cast<Box*>(d_sorted_boxes);
hipLaunchKernelGGL(HIP_KERNEL_NAME(IndexMultiSelect<int, Box>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_boxes, d_sorted_indices, original_boxes, sorted_boxes);
HIP_RETURN_IF_ERROR(hipGetLastError());
// STEP 2. filter boxes by scores
int limited_num_boxes = num_boxes;
if (pc.score_threshold_ != nullptr) {
thrust::device_ptr<float> sorted_scores_device_ptr(d_sorted_scores);
limited_num_boxes = thrust::count_if(
thrust::hip::par.on(stream),
sorted_scores_device_ptr,
sorted_scores_device_ptr + num_boxes,
DeviceGreaterThan(score_threshold));
HIP_RETURN_IF_ERROR(hipGetLastError());
if (limited_num_boxes == 0) {
*h_number_selected = 0;
return Status::OK();
}
}
// STEP 3. launch NMS kernels
ORT_RETURN_IF_ERROR(NmsGpu(stream,
allocator,
center_point_box,
d_sorted_boxes,
limited_num_boxes,
iou_threshold,
d_selected_indices,
h_number_selected,
max_output_boxes_per_class));
HIP_RETURN_IF_ERROR(hipGetLastError());
// STEP 4. map back to sorted indices
*h_number_selected = std::min(*h_number_selected, max_output_boxes_per_class);
int num_to_keep = *h_number_selected;
if (num_to_keep > 0) {
IAllocatorUniquePtr<void> d_output_indices_ptr{allocator(num_to_keep * sizeof(int))};
auto* d_output_indices = static_cast<int*>(d_output_indices_ptr.get());
IAllocatorUniquePtr<void> d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))};
auto* d_normalized_output_indices = static_cast<int64_t*>(d_normalized_output_indices_ptr.get());
blocksPerGrid = (int)(ceil(static_cast<float>(num_to_keep) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(IndexMultiSelect<int, int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices);
hipLaunchKernelGGL(NormalizeOutput, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index);
HIP_RETURN_IF_ERROR(hipGetLastError());
selected_indices = std::move(d_normalized_output_indices_ptr);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include <functional>
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
namespace onnxruntime {
namespace rocm {
Status NonMaxSuppressionImpl(
hipStream_t stream,
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const PrepareContext& pc,
const int64_t center_point_box,
int64_t batch_index,
int64_t class_index,
int max_output_boxes_per_class,
float iou_threshold,
float score_threshold,
IAllocatorUniquePtr<void>& selected_indices,
int* h_number_selected);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "roialign.h"
#include "roialign_impl.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
RoiAlign, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<int64_t>()), \
RoiAlign<T>);
template <typename T>
Status RoiAlign<T>::ComputeInternal(OpKernelContext* context) const {
// X
const auto* X_ptr = context->Input<Tensor>(0);
// rois
const auto* rois_ptr = context->Input<Tensor>(1);
// batch indices
const auto* batch_indices_ptr = context->Input<Tensor>(2);
const auto& x_dims = X_ptr->Shape();
const auto& rois_dims = rois_ptr->Shape();
const auto& batch_indices_dims = batch_indices_ptr->Shape();
auto num_rois = batch_indices_dims[0];
auto num_roi_cols = rois_dims[1];
auto status = CheckROIAlignValidInput(X_ptr, rois_ptr, batch_indices_ptr);
if (status != Status::OK()) {
return status;
}
Tensor& Y = *context->Output(0, {num_rois, x_dims[1], this->output_height_, this->output_width_});
int64_t output_size = Y.Shape().Size();
if (output_size > 0) {
RoiAlignImpl(
Stream(),
output_size, // num threads
reinterpret_cast<const typename ToHipType<T>::MappedType*>(X_ptr->Data<T>()),
ToHipType<T>::FromFloat(this->spatial_scale_),
x_dims[1], // num channels
x_dims[2], // height
x_dims[3], // width
this->output_height_,
this->output_width_,
this->sampling_ratio_,
reinterpret_cast<const typename ToHipType<T>::MappedType*>(rois_ptr->Data<T>()),
num_roi_cols,
reinterpret_cast<typename ToHipType<T>::MappedType*>(Y.MutableData<T>()),
this->mode_ == RoiAlignMode::avg,
this->half_pixel_,
batch_indices_ptr->Data<int64_t>());
}
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status RoiAlign<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
//SPECIALIZED_COMPUTE(MLFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/object_detection/roialign.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
struct RoiAlign final : RocmKernel, RoiAlignBase {
RoiAlign(const OpKernelInfo& info) : RocmKernel(info), RoiAlignBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RoiAlign);
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
#include "roialign_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
template <typename T>
__device__ T bilinear_interpolate(
const T* bottom_data,
const int height,
const int width,
T y,
T x,
const bool is_mode_avg,
const int index /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = is_mode_avg
? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg
: max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max
return val;
}
template <typename T>
__global__ void RoIAlignForward(
const int64_t nthreads,
const T* bottom_data,
const T spatial_scale,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const T* bottom_rois,
int64_t roi_cols,
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
// RoI could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * roi_cols;
const auto roi_batch_ind = batch_indices_ptr[n];
// Do not use rounding; this implementation detail is critical
T roi_offset = half_pixel ? T(0.5) : T(0);
T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!half_pixel) { // backward compatibility
// Force malformed ROIs to be 1x1
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_bottom_data =
bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) * height * width);
// We use roi_bin_grid to sample the grid and mimic integral pooling
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: _Ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
T output_val = 0.;
bool max_flag = false;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(
offset_bottom_data, height, width, y, x, is_mode_avg, index);
if (is_mode_avg) {
output_val += val;
} else {
if (!max_flag) {
output_val = val;
max_flag = true;
} else {
output_val = max(output_val, val);
}
}
}
}
if (is_mode_avg) {
output_val /= count;
}
top_data[index] = output_val;
}
}
template <typename T>
void RoiAlignImpl(
hipStream_t stream,
const int64_t nthreads,
const T* bottom_data,
const T spatial_scale,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const T* bottom_rois,
int64_t roi_cols,
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr) {
int blocksPerGrid = (int)(ceil(static_cast<float>(nthreads) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(RoIAlignForward<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
nthreads,
bottom_data,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
bottom_rois,
roi_cols,
top_data,
is_mode_avg,
half_pixel,
batch_indices_ptr);
}
#define SPECIALIZED_IMPL(T) \
template void RoiAlignImpl<T>( \
hipStream_t stream, \
const int64_t nthreads, \
const T* bottom_data, \
const T spatial_scale, \
const int64_t channels, \
const int64_t height, \
const int64_t width, \
const int64_t pooled_height, \
const int64_t pooled_width, \
const int64_t sampling_ratio, \
const T* bottom_rois, \
int64_t roi_cols, \
T* top_data, \
const bool is_mode_avg, \
const bool half_pixel, \
const int64_t* batch_indices_ptr);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
} // namespace rocm
} // namespace onnxruntime
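For reference, here is a minimal CPU sketch of the sampling-grid arithmetic that RoIAlignForward performs per output element, restricted to average mode with the half_pixel coordinate transform. The helper names cpu_bilinear and cpu_roi_align_avg are illustrative only and not part of this provider, and the guard on the sample count is an addition in this sketch.

// ---- illustrative CPU reference sketch (not part of this provider) ---------
#include <algorithm>
#include <cmath>

// Mirrors the boundary handling of the device bilinear_interpolate above.
static float cpu_bilinear(const float* data, int height, int width, float y, float x) {
  if (y < -1.0f || y > height || x < -1.0f || x > width) return 0.f;
  if (y <= 0.f) y = 0.f;
  if (x <= 0.f) x = 0.f;
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high, x_high;
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = static_cast<float>(y_low);
  } else {
    y_high = y_low + 1;
  }
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = static_cast<float>(x_low);
  } else {
    x_high = x_low + 1;
  }
  const float ly = y - y_low, lx = x - x_low;
  const float hy = 1.f - ly, hx = 1.f - lx;
  return hy * hx * data[y_low * width + x_low] + hy * lx * data[y_low * width + x_high] +
         ly * hx * data[y_high * width + x_low] + ly * lx * data[y_high * width + x_high];
}

// Average-mode RoIAlign for one channel of one ROI, following the same
// half_pixel sampling grid as RoIAlignForward. `roi` holds [x1, y1, x2, y2].
static void cpu_roi_align_avg(const float* channel, int height, int width,
                              float spatial_scale, const float roi[4],
                              int pooled_h, int pooled_w, int sampling_ratio,
                              float* out) {
  const float start_w = roi[0] * spatial_scale - 0.5f;
  const float start_h = roi[1] * spatial_scale - 0.5f;
  const float roi_w = roi[2] * spatial_scale - 0.5f - start_w;
  const float roi_h = roi[3] * spatial_scale - 0.5f - start_h;
  const float bin_h = roi_h / pooled_h;
  const float bin_w = roi_w / pooled_w;
  const int grid_h = sampling_ratio > 0 ? sampling_ratio : static_cast<int>(std::ceil(roi_h / pooled_h));
  const int grid_w = sampling_ratio > 0 ? sampling_ratio : static_cast<int>(std::ceil(roi_w / pooled_w));
  const float count = std::max(grid_h * grid_w, 1);  // guard degenerate ROIs in this sketch
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      float acc = 0.f;
      for (int iy = 0; iy < grid_h; ++iy) {
        const float y = start_h + ph * bin_h + (iy + 0.5f) * bin_h / grid_h;
        for (int ix = 0; ix < grid_w; ++ix) {
          const float x = start_w + pw * bin_w + (ix + 0.5f) * bin_w / grid_w;
          acc += cpu_bilinear(channel, height, width, y, x);
        }
      }
      out[ph * pooled_w + pw] = acc / count;
    }
  }
}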
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void RoiAlignImpl(
hipStream_t stream,
const int64_t nthreads,
const T* bottom_data,
const T spatial_scale,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const T* bottom_rois,
int64_t roi_cols,
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr);
} // namespace rocm
} // namespace onnxruntime
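As a usage illustration only, a host-side caller could drive the declaration above roughly as follows. The buffer names, sizes, and the assumption that this header and the HIP runtime are on the include path are hypothetical, and error checking is omitted; within the provider this entry point is normally reached through RoiAlign<T>::ComputeInternal rather than called directly.

// ---- hypothetical host-side usage sketch (error handling omitted) ----------
#include <hip/hip_runtime.h>
#include <cstdint>
#include "roialign_impl.h"  // assumed to be on the include path

void run_roi_align_example(hipStream_t stream) {
  const int64_t num_rois = 2, channels = 3, height = 16, width = 16;
  const int64_t pooled_h = 4, pooled_w = 4, sampling_ratio = 2, roi_cols = 4;
  const int64_t output_size = num_rois * channels * pooled_h * pooled_w;

  float* d_feat = nullptr;     // single input image, NCHW with N == 1
  float* d_rois = nullptr;     // num_rois x roi_cols, [x1, y1, x2, y2] per row
  float* d_out = nullptr;      // num_rois x channels x pooled_h x pooled_w
  int64_t* d_batch = nullptr;  // batch index per ROI (all 0 here)
  hipMalloc(&d_feat, sizeof(float) * channels * height * width);
  hipMalloc(&d_rois, sizeof(float) * num_rois * roi_cols);
  hipMalloc(&d_out, sizeof(float) * output_size);
  hipMalloc(&d_batch, sizeof(int64_t) * num_rois);
  // ... copy the feature map, ROI corners (input-image coordinates) and batch indices to the device ...

  onnxruntime::rocm::RoiAlignImpl<float>(
      stream, output_size, d_feat, /*spatial_scale=*/1.0f, channels, height, width,
      pooled_h, pooled_w, sampling_ratio, d_rois, roi_cols, d_out,
      /*is_mode_avg=*/true, /*half_pixel=*/true, d_batch);
  hipStreamSynchronize(stream);

  // ... copy d_out back to the host and consume it ...
  hipFree(d_feat); hipFree(d_rois); hipFree(d_out); hipFree(d_batch);
}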
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/reduction/reduction_functions.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>
#include "core/common/optional.h"
#include "core/framework/tensor_shape.h"
namespace onnxruntime {
namespace rocm {
namespace {
// gets the min and max of a single contiguous range of axes, if available
optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes(
int64_t rank,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& original_axes) {
assert(rank == static_cast<int64_t>(dims.size()));
// empty axes means reduce all dimensions
if (original_axes.empty()) {
return std::make_pair(int64_t{0}, rank - 1);
}
// normalize axis values and sort
const std::vector<int64_t> axes = [&original_axes, rank]() {
std::vector<int64_t> result(original_axes);
std::for_each(
result.begin(), result.end(),
[rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
std::sort(result.begin(), result.end());
return result;
}();
assert(!axes.empty());
const auto is_dim_one = [](int64_t dim) { return dim == 1; };
for (auto a = axes.begin(), b = axes.begin() + 1;
b != axes.end();
++a, ++b) {
ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
// if axis values are adjacent, the axes are contiguous
if (*a + 1 == *b) {
continue;
}
// if all dimension values between adjacent axes are 1,
// treat the axes as contiguous
if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) {
continue;
}
// otherwise, not contiguous
return nullopt;
}
// expand axes over surrounding dimensions with value of 1
const int64_t min_axis = [&dims, &axes, &is_dim_one]() -> int64_t {
const auto& min_given_axis = axes.front();
// note that std::reverse_iterator(it) refers to the element at (it-1)
// it -> reverse it: element offset of -1
const auto before_min_given_axis_rit =
std::make_reverse_iterator(dims.begin() + min_given_axis);
const auto before_min_axis_rit =
std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one);
// reverse it -> it: element offset of +1
return std::distance(dims.begin(), before_min_axis_rit.base());
}();
const int64_t max_axis = [&dims, &axes, &is_dim_one]() {
const auto& max_given_axis = axes.back();
const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1;
const auto after_max_axis_it =
std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one);
return std::distance(dims.begin(), after_max_axis_it - 1);
}();
return std::make_pair(min_axis, max_axis);
}
} // namespace
ApplicableMatrixReduction get_applicable_matrix_reduction(
const miopenReduceTensorOp_t miopen_reduce_op,
gsl::span<const int64_t> dims, gsl::span<const int64_t> original_axes,
int& m_out, int& n_out) {
if (miopen_reduce_op != MIOPEN_REDUCE_TENSOR_ADD && miopen_reduce_op != MIOPEN_REDUCE_TENSOR_AVG) {
return ApplicableMatrixReduction::None;
}
// Remove all dims with value 1. This helps optimize cases like:
// dims=[2,3,1,4,1,5] and axes=[0,2,4], which is the same as dims=[2,3,4,5] and axes=[0].
std::vector<int64_t> new_dims;
std::vector<int64_t> new_axes;
const auto original_rank = gsl::narrow<int64_t>(dims.size());
std::set<int64_t> original_axes_set;
for (const auto axis : original_axes) {
original_axes_set.insert(HandleNegativeAxis(axis, original_rank));
}
int64_t new_axis = 0;
for (size_t i = 0; i < dims.size(); i++) {
if (dims[i] != 1) {
new_dims.emplace_back(dims[i]);
if (original_axes_set.find(gsl::narrow<int64_t>(i)) != original_axes_set.end()) {
new_axes.emplace_back(new_axis);
}
new_axis++;
}
}
// Empty axes means reduce all dimensions, which has a different meaning,
// so add a new dim to the end if all original axes fall on dims with value 1.
if (!original_axes.empty() && new_axes.empty()) {
new_dims.emplace_back(1);
new_axes.emplace_back(new_axis);
}
// If all dims have value 1, make sure new_dims is not empty by adding a new dim.
if (!dims.empty() && new_dims.empty()) {
new_dims.emplace_back(1);
}
const auto rank = gsl::narrow<int64_t>(new_dims.size());
const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, new_dims, new_axes);
if (!min_and_max_axes.has_value()) {
return ApplicableMatrixReduction::None;
}
const auto& min_axis = min_and_max_axes->first;
const auto& max_axis = min_and_max_axes->second;
// Axes anchored at the beginning mean a row reduction; axes anchored at the end mean a column reduction.
// If the axes span from beginning to end, either works, and we do a row reduction.
const bool axes_from_beginning = min_axis == 0;
const bool axes_to_end = max_axis == rank - 1;
// only axes anchored to the beginning or the end are handled
if (!axes_from_beginning && !axes_to_end) {
return ApplicableMatrixReduction::None;
}
// the axis index right after the last axis flattened into the matrix rows
const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
const auto shape = TensorShape::FromExistingBuffer(new_dims);
const auto m = shape.SizeToDimension(m_end_axis);
const auto n = shape.SizeFromDimension(m_end_axis);
ORT_ENFORCE(m > 0 && n > 0, "shape must not have negative dimensions: ", shape);
if (m > std::numeric_limits<int>::max() ||
n > std::numeric_limits<int>::max()) {
return ApplicableMatrixReduction::None;
}
m_out = gsl::narrow_cast<int>(m);
n_out = gsl::narrow_cast<int>(n);
return axes_from_beginning
? ApplicableMatrixReduction::Rows
: ApplicableMatrixReduction::Columns;
}
} // namespace rocm
} // namespace onnxruntime
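To make the row/column mapping above concrete, the following standalone sketch reproduces the m/n computation for axes that are already non-negative, sorted, unique, and contiguous; the normalization and dim-1 squeezing performed above are assumed to have happened. MatrixReductionKind and classify_reduction are illustrative names, not part of this provider.

// ---- illustrative standalone sketch (not part of this provider) ------------
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

enum class MatrixReductionKind { None, Rows, Columns };

// Assumes `axes` is non-empty, normalized, sorted, unique, and contiguous.
static MatrixReductionKind classify_reduction(const std::vector<int64_t>& dims,
                                              const std::vector<int64_t>& axes,
                                              int64_t& m, int64_t& n) {
  const int64_t rank = static_cast<int64_t>(dims.size());
  const bool from_beginning = axes.front() == 0;
  const bool to_end = axes.back() == rank - 1;
  if (!from_beginning && !to_end) return MatrixReductionKind::None;
  // The axis index right after the block flattened into matrix rows.
  const int64_t m_end_axis = from_beginning ? axes.back() + 1 : axes.front();
  m = std::accumulate(dims.begin(), dims.begin() + m_end_axis, int64_t{1}, std::multiplies<int64_t>());
  n = std::accumulate(dims.begin() + m_end_axis, dims.end(), int64_t{1}, std::multiplies<int64_t>());
  return from_beginning ? MatrixReductionKind::Rows : MatrixReductionKind::Columns;
}

// Example: dims = {2, 3, 4, 5}
//   axes = {0, 1} -> Rows reduction with m = 6, n = 20
//   axes = {2, 3} -> Columns reduction with m = 6, n = 20
//   axes = {1, 2} -> None (not anchored to either end)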