gaoqiong / onnxruntime_v14 / Commits / 1a91fcc2
Commit 1a91fcc2 authored Jul 25, 2023 by gaoqiong
add files required by dtk
parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes 280 · Pipelines 1
Showing 20 changed files with 1887 additions and 0 deletions (+1887 -0)
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cuh        +20   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.h          +29   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.cc                 +57   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.h                  +77   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.cc                  +51   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.h                   +24   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.cu             +837  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.h              +40   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.cc        +70   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.h         +36   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.cu   +99   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.h    +24   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.cc        +122  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.h         +28   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements_impl.h    +19   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.cc              +89   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.h               +20   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.cu         +133  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.h          +23   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/sequence_op.cc             +89   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cuh  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "quantize_linear.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <class T, class U>
Status CudaQuantizeLinear(hipStream_t stream, const U* input, T* output, const U* scale,
                          const T* zero_point, size_t num_of_element);

template <class T, class U>
Status CudaDequantizeLinear(hipStream_t stream, const T* input, U* output, const U* scale,
                            const T* zero_point, size_t num_of_element);

}  // namespace rocm
}  // namespace onnxruntime
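For context on what these declarations compute (standard ONNX QuantizeLinear / DequantizeLinear semantics; background, not something stated in this header): quantization maps y = saturate(round(x / scale) + zero_point) into the integer type T, and dequantization maps x_hat = (y - zero_point) * scale back to the float type U. For example, with scale = 0.5 and zero_point = 10 (uint8), x = 3.2 quantizes to round(3.2 / 0.5) + 10 = 16, which dequantizes back to (16 - 10) * 0.5 = 3.0.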
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <class T, class U = float>
class QuantizeLinear final : public RocmKernel {
 public:
  QuantizeLinear(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};

template <class T, class U = float>
class DequantizeLinear final : public RocmKernel {
 public:
  DequantizeLinear(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.cc  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reshape.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
        .Alias(0, 0)
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Reshape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    13, 13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
        .Alias(0, 0)
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Reshape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    5, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
        .Alias(0, 0)
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Reshape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    1, 4,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Reshape_1);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/reshape_helper.h"
namespace onnxruntime {
namespace rocm {

class Reshape final : public RocmKernel {
 public:
  Reshape(const OpKernelInfo& info)
      : RocmKernel(info),
        allow_zero_(info.GetAttrOrDefault("allowzero", static_cast<int64_t>(0)) == 1) {
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    // Copy the second input tensor into the shape vector
    const Tensor* shapeTensor = context->Input<Tensor>(1);
    if (shapeTensor == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
    if (shapeTensor->Shape().NumDimensions() != 1)
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "A shape tensor must be a vector tensor, got ",
                             shapeTensor->Shape().NumDimensions(), " dimensions");
    auto data_span = shapeTensor->template DataAsSpan<int64_t>();
    TensorShapeVector shape(data_span.begin(), data_span.end());
    const Tensor* X = context->Input<Tensor>(0);
    if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
    const TensorShape& X_shape = X->Shape();

    ReshapeHelper helper(X_shape, shape, allow_zero_);

    Tensor* Y = context->Output(0, TensorShape(shape));
    const void* source = X->DataRaw();
    void* target = Y->MutableDataRaw();
    // If source and target pointers are not equal (non-inplace operation), we need to copy the data.
    if (target != source) {
      ORT_RETURN_IF_ERROR(CopyTensor(*X, *Y));
    }

    return Status::OK();
  }

 private:
  bool allow_zero_;
};

class Reshape_1 final : public RocmKernel {
 public:
  Reshape_1(const OpKernelInfo& info) : RocmKernel(info) {
    Status status = info.GetAttrs("shape", shape_);
    ORT_ENFORCE(status.IsOK(), "Attribute shape is not set.");
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    TensorShapeVector shape = shape_;
    const Tensor* X = context->Input<Tensor>(0);
    const TensorShape& X_shape = X->Shape();

    ReshapeHelper helper(X_shape, shape);

    Tensor* Y = context->Output(0, TensorShape(shape));
    const void* source = X->DataRaw();
    void* target = Y->MutableDataRaw();
    // If source and target pointers are not equal (non-inplace operation), we need to copy the data.
    if (target != source) {
      ORT_RETURN_IF_ERROR(CopyTensor(*X, *Y));
    }

    return Status::OK();
  }

 private:
  TensorShapeVector shape_;
};

}  // namespace rocm
}  // namespace onnxruntime
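As a quick reminder of the shape semantics that ReshapeHelper resolves here (standard ONNX Reshape behaviour, not specific to this file): a 0 in the target shape copies the corresponding input dimension when allowzero is 0, and a single -1 is inferred from the remaining element count. For example, with allowzero = 0:

    input shape  : [2, 3, 4]   (24 elements)
    target shape : [0, -1]     // 0 -> copy dim 0 (= 2), -1 -> 24 / 2 = 12
    output shape : [2, 12]

With allowzero = 1 a literal 0 is instead kept as a zero-sized output dimension.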
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.cc  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "resize.h"
namespace onnxruntime {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
10, 10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Resize<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
11, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()), \
Resize<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()), \
Resize<T>);
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
REGISTER_KERNEL_TYPED(int32_t)
REGISTER_KERNEL_TYPED(uint8_t)

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/tensor/upsample.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Resize : public Upsample<T> {
 public:
  Resize(const OpKernelInfo& info) : Upsample<T>(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    return Upsample<T>::ComputeInternal(context);
  }
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.cu  0 → 100644
#include "hip/hip_runtime.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/tensor/resize_impl.h"
namespace onnxruntime {
namespace rocm {

using onnxruntime::ResizeCoordinateTransformationMode;
using onnxruntime::ResizeNearestMode;
using onnxruntime::UpsampleMode;

struct NearestPixel_SIMPLE {
  __device__ __forceinline__ int operator()(float x_original, bool is_down_sampling) const {
    if (is_down_sampling) {
      return static_cast<int>(_Ceil(x_original));
    }
    return static_cast<int>(x_original);
  }
};

struct NearestPixel_ROUND_PREFER_FLOOR {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    if (x_original == static_cast<int>(x_original) + 0.5f) {
      return static_cast<int>(_Floor(x_original));
    }
    return static_cast<int>(roundf(x_original));
  }
};

struct NearestPixel_ROUND_PREFER_CEIL {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    return static_cast<int>(roundf(x_original));
  }
};

struct NearestPixel_FLOOR {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    return static_cast<int>(_Floor(x_original));
  }
};

struct NearestPixel_CEIL {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    return static_cast<int>(_Ceil(x_original));
  }
};

struct TransformCoordinate_ASYMMETRIC {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float, float, float, float) const {
    return x_resized / x_scale;
  }
};

struct TransformCoordinate_HALF_PIXEL {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float, float, float, float) const {
    return ((x_resized + 0.5f) / x_scale) - 0.5f;
  }
};

struct TransformCoordinate_PYTORCH_HALF_PIXEL {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float length_resized, float, float, float) const {
    return length_resized > 1 ? (x_resized + 0.5f) / x_scale - 0.5f : 0.0f;
  }
};

struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float, float, float, float) const {
    return (x_resized + 0.5f) / x_scale;
  }
};

struct TransformCoordinate_ALIGN_CORNERS {
  __device__ __forceinline__ float operator()(float x_resized, float, float length_resized, float length_original, float, float) const {
    return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1);
  }
};

struct TransformCoordinate_TF_CROP_AND_RESIZE {
  __device__ __forceinline__ float operator()(float x_resized, float, float length_resized, float length_original, float roi_start, float roi_end) const {
    auto orig = length_resized > 1
                    ? roi_start * (length_original - 1) + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1)
                    : 0.5 * (roi_start + roi_end) * (length_original - 1);
    return static_cast<float>(orig);
  }
};
#define CASE_TYPE_USING_HINT(enum_type, type, HINT, ...) \
case enum_type: { \
using HINT = type; \
return __VA_ARGS__(); \
}
#define CASE_TYPE_COORD(enum_type, type, ...) \
CASE_TYPE_USING_HINT(enum_type, type, coord_t, __VA_ARGS__)
#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
    /* don't use TYPE again in case it is an expensive or side-effect op */ \
switch (the_type) { \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \
default: \
ORT_THROW("unknown ResizeCoordinateTransformationMode"); \
} \
}()
#define CASE_TYPE_NEAREST(enum_type, type, ...) \
CASE_TYPE_USING_HINT(enum_type, type, nearest_t, __VA_ARGS__)
#define DISPATCH_RESIZE_NEAREST_MODE(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
    /* don't use TYPE again in case it is an expensive or side-effect op */ \
switch (the_type) { \
CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_FLOOR, NearestPixel_ROUND_PREFER_FLOOR, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \
default: \
ORT_THROW("unknown ResizeNearestMode"); \
} \
}()
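Reviewer note: the two DISPATCH_* macros above wrap a switch in an immediately-invoked lambda so each case can bind the selected functor type to a local alias (coord_t / nearest_t) before running the caller-supplied lambda. A minimal stand-alone sketch of the same idiom follows; Mode, FunctorA, FunctorB, hint_t and DISPATCH_MODE are made-up names for illustration only, not part of this file:

#include <cstdio>

enum class Mode { A, B };
struct FunctorA { int operator()(int v) const { return v + 1; } };
struct FunctorB { int operator()(int v) const { return v * 2; } };

// Immediately-invoked lambda around a switch: each case binds hint_t and
// returns the result of the caller-supplied lambda, as DISPATCH_RESIZE_* does.
#define DISPATCH_MODE(TYPE, ...)                                        \
  [&] {                                                                 \
    switch (TYPE) {                                                     \
      case Mode::A: { using hint_t = FunctorA; return __VA_ARGS__(); }  \
      case Mode::B: { using hint_t = FunctorB; return __VA_ARGS__(); }  \
    }                                                                   \
    return 0; /* unreachable for valid Mode values */                   \
  }()

int main() {
  Mode m = Mode::B;
  // The lambda argument is pasted inside each case, so hint_t is visible there.
  int out = DISPATCH_MODE(m, [&] { return hint_t()(21); });  // binds FunctorB, yields 42
  std::printf("%d\n", out);
  return 0;
}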
struct NearestMappingInfo {
  int origin_;
  int extrapolate_;
};

template <typename T, typename CudaFunctionOriginalCoordinate, typename CudaFunctionNearestPixel>
__global__ void _ResizeNearestMappingKernel2D(
    const int input_height, const int input_width,
    const int output_height, const int output_width,
    const float scales_height, const float scales_width,
    const float roi_start_height, const float roi_end_height,
    const float roi_start_width, const float roi_end_width,
    const bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    const CudaFunctionNearestPixel& calc_nearest_pixel,
    NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, output_height + output_width);
  if (id >= 0 && id < output_height) {  // for Height
    int dim = id;

    // only apply co-ordinate transformation if scale != 1.0
    if (scales_height == 1.0f) {
      dims_mapping[id].extrapolate_ = 0;
    } else {
      float orig_coord = transform_coordinate(static_cast<float>(dim), scales_height,
                                              static_cast<float>(output_height), static_cast<float>(input_height),
                                              roi_start_height, roi_end_height);
      dims_mapping[id].extrapolate_ = static_cast<int>(
          extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_height - 1)));
      dim = calc_nearest_pixel(orig_coord, scales_height < 1);
      if (dim >= input_height) dim = input_height - 1;
      if (dim < 0) dim = 0;
    }

    dims_mapping[id].origin_ = dim;
  } else {
    int dim = id - output_height;

    // only apply co-ordinate transformation if scale != 1.0
    if (scales_width == 1.0f) {
      dims_mapping[id].extrapolate_ = 0;
    } else {
      float orig_coord = transform_coordinate(static_cast<float>(dim), scales_width,
                                              static_cast<float>(output_width), static_cast<float>(input_width),
                                              roi_start_width, roi_end_width);
      dims_mapping[id].extrapolate_ = static_cast<int>(
          extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_width - 1)));
      dim = calc_nearest_pixel(orig_coord, scales_width < 1);
      if (dim >= input_width) dim = input_width - 1;
      if (dim < 0) dim = 0;
    }

    dims_mapping[id].origin_ = dim;
    return;
  }
}
template <typename T, typename CudaFunctionOriginalCoordinate, typename CudaFunctionNearestPixel>
__global__ void _ResizeNearestMappingKernel(
    const size_t rank,
    const TArray<int64_t> input_shape,
    const TArray<int64_t> output_shape,
    const TArray<float> scales,
    const TArray<float, 10> roi,
    const size_t total_dim_sum,
    bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    const CudaFunctionNearestPixel& calc_nearest_pixel,
    int64_t* prefix_dim_sum,
    NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, total_dim_sum);
  int64_t dim_sum = 0;
  for (int axis = 0; axis < rank; ++axis) {
    if (id == dim_sum) {
      prefix_dim_sum[axis] = dim_sum;
    }
    if (id >= dim_sum && id < dim_sum + output_shape[axis]) {
      int dim = id - dim_sum;

      // only apply co-ordinate transformation if scale != 1.0
      if (scales[axis] == 1.0f) {
        dims_mapping[id].extrapolate_ = 0;
      } else {
        float orig_coord = transform_coordinate(static_cast<float>(dim), scales[axis],
                                                static_cast<float>(output_shape[axis]),
                                                static_cast<float>(input_shape[axis]),
                                                roi[axis], roi[axis + rank]);
        dims_mapping[id].extrapolate_ = static_cast<int>(
            extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_shape[axis] - 1)));
        dim = calc_nearest_pixel(orig_coord, scales[axis] < 1);
        if (dim >= input_shape[axis]) dim = input_shape[axis] - 1;
        if (dim < 0) dim = 0;
      }

      dims_mapping[id].origin_ = dim;
      return;
    }
    dim_sum += output_shape[axis];
  }
}
template <typename T, bool UseExtrapolation>
__global__ void _ResizeNearestKernel2D(
    const int64_t output_height, const int64_t output_width,
    const int64_t input_stride_image, const int input_stride_row,
    const fast_divmod output_stride_image, const fast_divmod output_stride_row,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    const NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int imageid, h, w, output_index;
  output_stride_image.divmod(static_cast<int>(id), imageid, output_index);
  output_stride_row.divmod(output_index, h, w);
  if (UseExtrapolation) {
    if (dims_mapping[h].extrapolate_ + dims_mapping[output_height + w].extrapolate_) {
      output_data[id] = extrapolation_value;
      return;
    }
  }
  int input_index = input_stride_image * imageid +
                    input_stride_row * dims_mapping[h].origin_ +
                    dims_mapping[output_height + w].origin_;
  output_data[id] = input_data[input_index];
}
template <typename T>
__global__ void _ResizeNearestKernel(
    const int rank,
    const TArray<int64_t> input_strides,
    const TArray<fast_divmod> output_div_pitches,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    const int64_t* prefix_dim_sum,
    const NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int output_index = static_cast<int>(id);
  int input_index = 0;
  int extrapolation_occured = 0;
  for (int axis = 0; axis < rank; ++axis) {
    int dim = 0;
    output_div_pitches[axis].divmod(output_index, dim, output_index);
    const NearestMappingInfo& mi = dims_mapping[prefix_dim_sum[axis] + dim];
    extrapolation_occured += mi.extrapolate_;
    input_index += input_strides[axis] * mi.origin_;
  }
  output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index];
}
struct LinearMappingInfo {
  int origin_;
  float weight_;
  int extrapolate_;
};

template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeBilinearCoordinateMapping(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    float scale_height, float scale_width,
    float roi_height_start, float roi_height_end,
    float roi_width_start, float roi_width_end,
    const size_t SumHW, bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
  if (id < output_height) {  // y = id
    float input_y = scale_height == 1 ? static_cast<float>(id)
                                      : transform_coordinate(static_cast<float>(id), scale_height,
                                                             static_cast<float>(output_height),
                                                             static_cast<float>(input_height),
                                                             roi_height_start, roi_height_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
    input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
    int y_int = static_cast<int>(input_y);
    dims_mapping[id].origin_ = y_int;
    dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
  } else {  // x = id - output_height
    float input_x = scale_width == 1 ? static_cast<float>(id - output_height)
                                     : transform_coordinate(static_cast<float>(id - output_height), scale_width,
                                                            static_cast<float>(output_width),
                                                            static_cast<float>(input_width),
                                                            roi_width_start, roi_width_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
    input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
    int x_int = static_cast<int>(input_x);
    dims_mapping[id].origin_ = x_int;
    dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
  }
}
// The following method supports a 2-D or 4-D input in 'Linear mode'. The last two dimensions are [H, W].
// The scale values for the outer dimensions (all except the last two) are 1.
template <typename T>
__global__ void _ResizeBilinearKernel(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    fast_divmod div_output_width, fast_divmod div_output_image,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int bxc, output_image_index;
  div_output_image.divmod(id, bxc, output_image_index);
  HIP_LONG input_index = bxc * input_height * input_width;
  int output_y, output_x;
  div_output_width.divmod(output_image_index, output_y, output_x);

  if (dims_mapping[output_y].extrapolate_ || dims_mapping[output_x + output_height].extrapolate_) {
    output_data[id] = extrapolation_value;
    return;
  }
  float y_offset_0 = dims_mapping[output_y].weight_;
  int y_int = dims_mapping[output_y].origin_;
  float x_offset_0 = dims_mapping[output_x + output_height].weight_;
  int x_int = dims_mapping[output_x + output_height].origin_;
  input_index += y_int * input_width + x_int;

  T x00 = input_data[input_index];
  bool end_of_h = (y_int >= input_height - 1);
  bool end_of_w = (x_int >= input_width - 1);
  T x10 = end_of_w ? x00 : input_data[input_index + 1];
  T x01 = end_of_h ? x00 : input_data[input_index + input_width];
  T x11 = end_of_w ? x01 : (end_of_h ? x10 : input_data[input_index + input_width + 1]);
  float y_offset_1 = 1.0f - y_offset_0;
  float x_offset_1 = 1.0f - x_offset_0;
  output_data[id] =
      x00 * static_cast<T>(y_offset_1 * x_offset_1) +
      x01 * static_cast<T>(y_offset_0 * x_offset_1) +
      x10 * static_cast<T>(y_offset_1 * x_offset_0) +
      x11 * static_cast<T>(y_offset_0 * x_offset_0);
}
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeTrilinearCoordinateMapping(
    int64_t input_depth, int64_t input_height, int64_t input_width,
    int64_t output_depth, int64_t output_height, int64_t output_width,
    float scale_depth, float scale_height, float scale_width,
    float roi_depth_start, float roi_depth_end,
    float roi_height_start, float roi_height_end,
    float roi_width_start, float roi_width_end,
    const size_t SumDHW, bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW);
  if (id < output_depth) {  // z = id
    float input_z = scale_depth == 1 ? static_cast<float>(id)
                                     : transform_coordinate(static_cast<float>(id), scale_depth,
                                                            static_cast<float>(output_depth),
                                                            static_cast<float>(input_depth),
                                                            roi_depth_start, roi_depth_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast<float>(input_depth - 1)));
    input_z = max(0.0f, min(input_z, static_cast<float>(input_depth - 1)));
    int z_int = static_cast<int>(input_z);
    dims_mapping[id].origin_ = z_int;
    dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int;
  } else if (id >= output_depth && id < (output_depth + output_height)) {  // y = id - output_depth
    float input_y = scale_height == 1 ? static_cast<float>(id - output_depth)
                                      : transform_coordinate(static_cast<float>(id - output_depth), scale_height,
                                                             static_cast<float>(output_height),
                                                             static_cast<float>(input_height),
                                                             roi_height_start, roi_height_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
    input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
    int y_int = static_cast<int>(input_y);
    dims_mapping[id].origin_ = y_int;
    dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
  } else {  // x = id - output_depth - output_height
    float input_x = scale_width == 1 ? static_cast<float>(id - output_depth - output_height)
                                     : transform_coordinate(static_cast<float>(id - output_depth - output_height), scale_width,
                                                            static_cast<float>(output_width),
                                                            static_cast<float>(input_width),
                                                            roi_width_start, roi_width_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
    input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
    int x_int = static_cast<int>(input_x);
    dims_mapping[id].origin_ = x_int;
    dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
  }
}
// The following method supports a 3-D or 5-D input in 'Linear mode'. The last three dimensions are [D, H, W].
// The scale values for the outer dimensions (all except the last three) are 1.
template <typename T>
__global__ void _ResizeTrilinearKernel(
    int64_t input_depth, int64_t input_height, int64_t input_width,
    int64_t output_depth, int64_t output_height, int64_t output_width,
    fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int bxc, output_image_index;
  div_output_image.divmod(id, bxc, output_image_index);
  HIP_LONG input_index = bxc * input_depth * input_height * input_width;
  int output_z, output_y, output_x, temp;
  div_output_height.divmod(output_image_index, output_z, temp);
  div_output_width.divmod(temp, output_y, output_x);

  if (dims_mapping[output_z].extrapolate_ ||
      dims_mapping[output_y + output_depth].extrapolate_ ||
      dims_mapping[output_x + output_depth + output_height].extrapolate_) {
    output_data[id] = extrapolation_value;
    return;
  }
  float z_offset_0 = dims_mapping[output_z].weight_;
  int z_int = dims_mapping[output_z].origin_;
  float y_offset_0 = dims_mapping[output_y + output_depth].weight_;
  int y_int = dims_mapping[output_y + output_depth].origin_;
  float x_offset_0 = dims_mapping[output_x + output_depth + output_height].weight_;
  int x_int = dims_mapping[output_x + output_depth + output_height].origin_;
  input_index += z_int * input_height * input_width + y_int * input_width + x_int;

  T x000 = input_data[input_index];
  bool end_of_h = (y_int >= input_height - 1);
  bool end_of_w = (x_int >= input_width - 1);
  T x100 = end_of_w ? x000 : input_data[input_index + 1];
  T x010 = end_of_h ? x000 : input_data[input_index + input_width];
  T x110 = end_of_w ? x010 : (end_of_h ? x100 : input_data[input_index + input_width + 1]);

  bool end_of_d = (z_int >= input_depth - 1);
  if (!end_of_d) {
    input_index = input_index + input_height * input_width;
  }
  T x001 = end_of_d ? x000 : input_data[input_index];
  T x101 = end_of_w ? x001 : input_data[input_index + 1];
  T x011 = end_of_h ? x001 : input_data[input_index + input_width];
  T x111 = end_of_w ? x011 : (end_of_h ? x101 : input_data[input_index + input_width + 1]);

  float z_offset_1 = 1.0f - z_offset_0;
  float y_offset_1 = 1.0f - y_offset_0;
  float x_offset_1 = 1.0f - x_offset_0;
  output_data[id] =
      x000 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_1) +
      x010 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_1) +
      x100 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_0) +
      x110 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_0) +
      x001 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_1) +
      x011 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_1) +
      x101 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_0) +
      x111 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_0);
}
template <typename T>
__device__ __forceinline__ float CubicInterpolationRowwise(
    const T* image, int x, int y, int input_height, int input_width,
    float coeff0, float coeff1, float coeff2, float coeff3) {
  int row_index = max(0, min(y, input_height - 1)) * input_width;
  return coeff0 * static_cast<float>(image[row_index + max(0, min(x - 1, input_width - 1))]) +
         coeff1 * static_cast<float>(image[row_index + max(0, min(x, input_width - 1))]) +
         coeff2 * static_cast<float>(image[row_index + max(0, min(x + 1, input_width - 1))]) +
         coeff3 * static_cast<float>(image[row_index + max(0, min(x + 2, input_width - 1))]);
}

struct CubicMappingInfo {
  int origin_;
  int extrapolate_;
  float coeff0_;
  float coeff1_;
  float coeff2_;
  float coeff3_;
};
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeCubicCoordinateMapping(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    float scale_height, float scale_width,
    float roi_height_start, float roi_height_end,
    float roi_width_start, float roi_width_end,
    const size_t SumHW, bool extrapolation_enabled,
    float cubic_coeff_a, bool exclude_outside,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    CubicMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
  auto& dm = dims_mapping[id];
  bool is_y_axis = (id < output_height);
  int max_input_coord = static_cast<int>(is_y_axis ? input_height : input_width);

  float scale = is_y_axis ? scale_height : scale_width;
  float input_coordinat = scale == 1
                              ? (is_y_axis ? id : id - output_height)
                              : transform_coordinate(
                                    static_cast<float>(is_y_axis ? id : id - output_height),
                                    scale,
                                    static_cast<float>(is_y_axis ? output_height : output_width),
                                    static_cast<float>(max_input_coord),
                                    (is_y_axis ? roi_height_start : roi_width_start),
                                    (is_y_axis ? roi_height_end : roi_width_end));
  int coord_int = static_cast<int>(_Floor(input_coordinat));
  float s_coord = abs(input_coordinat - coord_int);
  float coeff_sum = 1.0f;
  float coeff_0 = static_cast<float>(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * (s_coord + 1) + 8 * cubic_coeff_a) * (s_coord + 1) - 4 * cubic_coeff_a);
  float coeff_1 = static_cast<float>(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * s_coord * s_coord + 1);
  float coeff_2 = static_cast<float>(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * (1 - s_coord) * (1 - s_coord) + 1);
  float coeff_3 = static_cast<float>(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * (2 - s_coord) + 8 * cubic_coeff_a) * (2 - s_coord) - 4 * cubic_coeff_a);
  if (exclude_outside) {
    coeff_0 = (coord_int - 1 < 0 || coord_int - 1 >= max_input_coord) ? 0.0 : coeff_0;
    coeff_1 = (coord_int + 0 < 0 || coord_int + 0 >= max_input_coord) ? 0.0 : coeff_1;
    coeff_2 = (coord_int + 1 < 0 || coord_int + 1 >= max_input_coord) ? 0.0 : coeff_2;
    coeff_3 = (coord_int + 2 < 0 || coord_int + 2 >= max_input_coord) ? 0.0 : coeff_3;
    coeff_sum = coeff_0 + coeff_1 + coeff_2 + coeff_3;
  }
  dm.origin_ = coord_int;
  dm.coeff0_ = coeff_0 / coeff_sum;
  dm.coeff1_ = coeff_1 / coeff_sum;
  dm.coeff2_ = coeff_2 / coeff_sum;
  dm.coeff3_ = coeff_3 / coeff_sum;
  dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || input_coordinat > static_cast<float>(max_input_coord - 1)));
}
template <typename T>
__global__ void _ResizeBiCubicKernel(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    fast_divmod div_output_width, fast_divmod div_output_image,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    CubicMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int bxc, output_image_index, output_x, output_y;
  div_output_image.divmod(id, bxc, output_image_index);
  HIP_LONG input_index = bxc * input_height * input_width;
  div_output_width.divmod(output_image_index, output_y, output_x);

  CubicMappingInfo& y_info = dims_mapping[output_y];
  CubicMappingInfo& x_info = dims_mapping[output_x + output_height];
  if (y_info.extrapolate_ || x_info.extrapolate_) {
    output_data[id] = extrapolation_value;
    return;
  }

  float w0 = x_info.coeff0_;
  float w1 = x_info.coeff1_;
  float w2 = x_info.coeff2_;
  float w3 = x_info.coeff3_;
  int x_int = x_info.origin_;
  int y_int = y_info.origin_;
  const T* image = input_data + input_index;
  output_data[id] = y_info.coeff0_ * CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) +
                    y_info.coeff1_ * CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) +
                    y_info.coeff2_ * CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) +
                    y_info.coeff3_ * CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3);
}
size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
                            const gsl::span<const int64_t>& output_dims) {
  switch (upsample_mode) {
    case UpsampleMode::NN:
      return sizeof(int64_t) * output_dims.size() +
             sizeof(NearestMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0));
    case UpsampleMode::LINEAR:
      return sizeof(LinearMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
    case UpsampleMode::CUBIC:
      return sizeof(CubicMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
  }
  return 0;
}
template <typename T>
void ResizeNearestImpl(
    hipStream_t stream,
    const int rank,
    TArray<int64_t>& input_shape,
    TArray<int64_t>& output_shape,
    TArray<int64_t>& input_strides,
    TArray<fast_divmod>& output_div_pitches,
    TArray<float>& scales_vals,
    TArray<float, 10>& roi_vals,
    const T* input_data,
    T* output_data,
    const size_t N,
    bool extrapolation_enabled,
    const T extrapolation_value,
    float cubic_coeff_a,
    ResizeCoordinateTransformationMode transform_coordinate,
    ResizeNearestMode calc_nearest_pixel,
    int64_t* /* prefix_dim_sum */,
    NearestMappingInfo* dims_mapping) {
  unsigned int blocksPerGrid = static_cast<unsigned int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));

  bool could2d = rank >= 2 &&
                 transform_coordinate != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE &&
                 std::all_of(scales_vals.Data(), scales_vals.Data() + (rank - 2), [](float v) { return v == 1.0; });
  if (could2d) {
    int64_t output_height = output_shape[rank - 2];
    int64_t output_width = output_shape[rank - 1];
    fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3]
                                              : fast_divmod(static_cast<int>(output_height * output_width));
    int blocksPerDimsMappingGrid = static_cast<int>(ceil((output_height + output_width) / 32.0));

    DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() {
      DISPATCH_RESIZE_NEAREST_MODE(calc_nearest_pixel, [&]() {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestMappingKernel2D<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                           static_cast<int>(input_shape[rank - 2]), static_cast<int>(input_shape[rank - 1]),
                           static_cast<int>(output_height), static_cast<int>(output_width),
                           scales_vals[rank - 2], scales_vals[rank - 1],
                           roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                           roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                           extrapolation_enabled, coord_t(), nearest_t(),
                           dims_mapping);
      });
    });
    if (extrapolation_enabled) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel2D<T, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         output_height, output_width,
                         input_shape[rank - 2] * input_shape[rank - 1], static_cast<int>(input_shape[rank - 1]),
                         div_output_image, output_div_pitches[rank - 2],
                         input_data, output_data, N,
                         extrapolation_value,
                         dims_mapping);
    } else {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel2D<T, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         output_height, output_width,
                         input_shape[rank - 2] * input_shape[rank - 1], static_cast<int>(input_shape[rank - 1]),
                         div_output_image, output_div_pitches[rank - 2],
                         input_data, output_data, N,
                         extrapolation_value,
                         dims_mapping);
    }
    return;
  }

  int64_t total_dim_sum = std::accumulate(output_shape.Data(), output_shape.Data() + rank, (int64_t)0);
  int blocksPerDimsMappingGrid = (int)(ceil(static_cast<double>(total_dim_sum) / 32));
  DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() {
    DISPATCH_RESIZE_NEAREST_MODE(calc_nearest_pixel, [&]() {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestMappingKernel<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                         rank, input_shape, output_shape,
                         scales_vals, roi_vals,
                         total_dim_sum, extrapolation_enabled,
                         coord_t(), nearest_t(),
                         reinterpret_cast<int64_t*>(dims_mapping),
                         reinterpret_cast<NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
    });
  });
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     rank, input_strides, output_div_pitches,
                     input_data, output_data, N,
                     extrapolation_value,
                     reinterpret_cast<const int64_t*>(dims_mapping),
                     reinterpret_cast<const NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
  return;
}
template <typename T>
void ResizeImpl(
    hipStream_t stream,
    const UpsampleMode upsample_mode,
    const int rank,
    TArray<int64_t>& input_shape,
    TArray<int64_t>& output_shape,
    TArray<int64_t>& input_strides,
    TArray<fast_divmod>& output_div_pitches,
    TArray<float>& scales_vals,
    TArray<float, 10>& roi_vals,
    const T* input_data,
    T* output_data,
    const size_t N,
    bool extrapolation_enabled,
    const T extrapolation_value,
    float cubic_coeff_a,
    bool exclude_outside,
    ResizeCoordinateTransformationMode coordinate_transform_mode,
    ResizeNearestMode nearest_mode,
    void* dims_mapping) {
  bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) &&
                (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE);
  if (isSame) {
    HIP_CALL_THROW(hipMemcpyAsync(output_data, input_data, N * sizeof(T), hipMemcpyDeviceToDevice, stream));
    return;
  }

  if (upsample_mode == UpsampleMode::NN) {
    ResizeNearestImpl(
        stream, rank, input_shape, output_shape, input_strides, output_div_pitches,
        scales_vals, roi_vals, input_data, output_data, N,
        extrapolation_enabled, extrapolation_value, cubic_coeff_a,
        coordinate_transform_mode, nearest_mode,
        reinterpret_cast<int64_t*>(dims_mapping),
        reinterpret_cast<NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
    return;
  }

  // We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0.
  // We would have validated the outer scale values by the time execution reaches this.
  bool is_2D = (rank == 2 || rank == 4);

  // We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0.
  // We would have validated the outer scale values by the time execution reaches this.
  bool is_3D = (rank == 3 || rank == 5);

  // Should not hit this as we have already validated input rank/scales and we provide verbose error messages
  // to the user.
  ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize");

  int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  fast_divmod div_output_image;
  if (is_2D) {
    div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
  } else if (is_3D) {
    div_output_image = (rank > 3) ? output_div_pitches[rank - 4] : fast_divmod(gsl::narrow_cast<int>(N));
  }
  int64_t output_depth = is_3D ? output_shape[rank - 3] : 0;
  int64_t output_height = output_shape[rank - 2];
  int64_t output_width = output_shape[rank - 1];
  int blocksPerDimsMappingGrid = static_cast<int>(ceil((output_depth + output_height + output_width) / 32.0));

  switch (upsample_mode) {
    case UpsampleMode::LINEAR:
      if (is_2D) {
        DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
          hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBilinearCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                             input_shape[rank - 2], input_shape[rank - 1],
                             output_height, output_width,
                             scales_vals[rank - 2], scales_vals[rank - 1],
                             roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                             roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                             output_height + output_width, extrapolation_enabled, coord_t(),
                             reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        });
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBilinearKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           input_shape[rank - 2], input_shape[rank - 1],
                           output_height, output_width,
                           output_div_pitches[rank - 2], div_output_image,
                           input_data, output_data, N,
                           extrapolation_value,
                           reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        return;
      } else if (is_3D) {
        DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
          hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeTrilinearCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                             input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
                             output_depth, output_height, output_width,
                             scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1],
                             roi_vals[rank - 3], roi_vals[rank - 3 + rank],
                             roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                             roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                             output_depth + output_height + output_width, extrapolation_enabled, coord_t(),
                             reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        });
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeTrilinearKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
                           output_depth, output_height, output_width,
                           output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image,
                           input_data, output_data, N,
                           extrapolation_value,
                           reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        return;
      }
      ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
      break;
    case UpsampleMode::CUBIC:
      if (is_2D) {
        DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
          hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeCubicCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                             input_shape[rank - 2], input_shape[rank - 1],
                             output_height, output_width,
                             scales_vals[rank - 2], scales_vals[rank - 1],
                             roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                             roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                             output_height + output_width, extrapolation_enabled,
                             cubic_coeff_a, exclude_outside, coord_t(),
                             reinterpret_cast<CubicMappingInfo*>(dims_mapping));
        });
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBiCubicKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           input_shape[rank - 2], input_shape[rank - 1],
                           output_height, output_width,
                           output_div_pitches[rank - 2], div_output_image,
                           input_data, output_data, N,
                           extrapolation_value,
                           reinterpret_cast<CubicMappingInfo*>(dims_mapping));
        return;
      }
      ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
    case UpsampleMode::NN:
      ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
  }
}
#define SPECIALIZED_IMPL(T) \
template void ResizeImpl<T>( \
hipStream_t stream, \
const UpsampleMode upsample_mode, \
const int rank, \
TArray<int64_t>& input_shape, \
TArray<int64_t>& output_shape, \
TArray<int64_t>& input_strides, \
TArray<fast_divmod>& output_div_pitches, \
TArray<float>& scales_vals, \
TArray<float, 10>& roi_vals, \
const T* input_data, \
T* output_data, \
const size_t N, \
bool extrapolation_enabled, \
const T extrapolation_value, \
float cubic_coeff_a, \
bool exclude_outside, \
ResizeCoordinateTransformationMode coordinate_transform_mode, \
ResizeNearestMode nearest_mode, \
void* dims_mapping);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint8_t)

}  // namespace rocm
}  // namespace onnxruntime
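For quick reference, the coordinate transformations implemented by the TransformCoordinate_* functors near the top of this file (x_r is the resized/output coordinate, s the scale, L_o and L_r the original and resized lengths) are:

    half_pixel           : x_o = (x_r + 0.5) / s - 0.5
    asymmetric           : x_o = x_r / s
    pytorch_half_pixel   : x_o = L_r > 1 ? (x_r + 0.5) / s - 0.5 : 0
    tf_half_pixel_for_nn : x_o = (x_r + 0.5) / s
    align_corners        : x_o = L_r == 1 ? 0 : x_r * (L_o - 1) / (L_r - 1)
    tf_crop_and_resize   : x_o = L_r > 1 ? roi_start*(L_o-1) + x_r*(roi_end-roi_start)*(L_o-1)/(L_r-1)
                                         : 0.5*(roi_start+roi_end)*(L_o-1)

This is a restatement of the functor bodies above, collected in one place for readability.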
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {

size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
                            const gsl::span<const int64_t>& output_dims);

template <typename T>
void ResizeImpl(
    hipStream_t stream,
    const onnxruntime::UpsampleMode upsample_mode,
    const int rank,
    TArray<int64_t>& input_shape,
    TArray<int64_t>& output_shape,
    TArray<int64_t>& input_strides,
    TArray<fast_divmod>& output_div_pitches,
    TArray<float>& scales_vals,
    TArray<float, 10>& roi,
    const T* input_data,
    T* output_data,
    const size_t N,
    bool extrapolation_enabled,
    const T extrapolation_value,
    float cubic_coeff_a,
    bool exclude_outside,
    onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode,
    onnxruntime::ResizeNearestMode nearest_mode,
    void* dims_mapping);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.cc  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reverse_sequence.h"
#include "reverse_sequence_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    ReverseSequence,
    kOnnxDomain,
    10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    ReverseSequenceOp);
#define ReverseSequenceCallCudaImplTypeAs(T, TEqual) \
if (X.IsDataType<T>()) { \
HIP_RETURN_IF_ERROR(ReverseSequenceCudaImpl( \
Stream(), \
reinterpret_cast<const typename ToHipType<TEqual>::MappedType*>(X.Data<T>()), \
seq_lengths.Data<int64_t>(), \
reinterpret_cast<typename ToHipType<TEqual>::MappedType*>(Y.MutableData<T>()), \
gsl::narrow<int>(batch_size), gsl::narrow<int>(max_seq_len), gsl::narrow<int>(element_size), \
time_major_)); \
return Status::OK(); \
}
Status ReverseSequenceOp::ComputeInternal(OpKernelContext* context) const {
  const auto& X = *context->Input<Tensor>(0);
  const auto& dims = X.Shape();

  const auto batch_size = time_major_ ? dims[1] : dims[0];
  const auto max_seq_len = time_major_ ? dims[0] : dims[1];
  const auto element_size = dims.SizeFromDimension(2);

  const auto& seq_lengths = *context->Input<Tensor>(1);
  const auto& seq_len_shape = seq_lengths.Shape();

  if (seq_len_shape.NumDimensions() != 1 || seq_len_shape[0] != batch_size) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "sequence_lens shape must be {batch_size}. Got:",
                           seq_len_shape, ". batch_size=", batch_size);
  }
  auto& Y = *context->Output(0, dims);

  ReverseSequenceCallCudaImplTypeAs(float, int32_t);
  ReverseSequenceCallCudaImplTypeAs(int32_t, int32_t);
  ReverseSequenceCallCudaImplTypeAs(uint32_t, int32_t);

  ReverseSequenceCallCudaImplTypeAs(MLFloat16, int16_t);
  ReverseSequenceCallCudaImplTypeAs(int16_t, int16_t);
  ReverseSequenceCallCudaImplTypeAs(uint16_t, int16_t);

  ReverseSequenceCallCudaImplTypeAs(int8_t, int8_t);
  ReverseSequenceCallCudaImplTypeAs(uint8_t, int8_t);
  ReverseSequenceCallCudaImplTypeAs(bool, int8_t);

  ReverseSequenceCallCudaImplTypeAs(int64_t, int64_t);
  ReverseSequenceCallCudaImplTypeAs(double, int64_t);
  ReverseSequenceCallCudaImplTypeAs(uint64_t, int64_t);

  return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
                         "Type for ", X.DataType(), " is not supported yet in ReverseSequence.");
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

class ReverseSequenceOp final : public RocmKernel {
 public:
  ReverseSequenceOp(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t batch_axis;
    int64_t time_axis;
    ORT_ENFORCE(info.GetAttr<int64_t>("batch_axis", &batch_axis).IsOK());
    ORT_ENFORCE(info.GetAttr<int64_t>("time_axis", &time_axis).IsOK());

    ORT_ENFORCE(batch_axis < 2, "Invalid batch_axis of ", batch_axis, ". Must be 0 or 1");
    ORT_ENFORCE(time_axis < 2, "Invalid time_axis of ", time_axis, ". Must be 0 or 1");
    ORT_ENFORCE(batch_axis != time_axis,
                "time_axis and batch_axis must have different values but both are ", time_axis);

    time_major_ = time_axis == 0;
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool time_major_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.cu  0 → 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reverse_sequence_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {

static const int kReverseSequenceElementsPerThread = 4;

template <typename T, bool time_major>
__global__ void ReverseSequenceImplKernel(
    const T* x_data,
    const int64_t* seq_len_data,
    T* y_data,
    const int batch_size,
    const int max_seq_len,
    const int element_size,
    const int group_count,
    const fast_divmod fdm_grouped_stride_0,
    const fast_divmod fdm_grouped_stride_1) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(grouped_index, group_count);
  int batch_id, seq_id, gid = grouped_index;
  if (time_major) {
    fdm_grouped_stride_0.divmod(gid, seq_id, gid);
    fdm_grouped_stride_1.divmod(gid, batch_id, gid);
  } else {
    fdm_grouped_stride_0.divmod(gid, batch_id, gid);
    fdm_grouped_stride_1.divmod(gid, seq_id, gid);
  }
  int eid = gid * kReverseSequenceElementsPerThread;
  int target_seq_id = (seq_id < (int)seq_len_data[batch_id]) ? ((int)seq_len_data[batch_id] - 1 - seq_id) : seq_id;
  int flat_src_idx, flat_target_idx;
  if (time_major) {
    flat_src_idx = seq_id * batch_size * element_size + batch_id * element_size + eid;
    flat_target_idx = target_seq_id * batch_size * element_size + batch_id * element_size + eid;
  } else {
    flat_src_idx = batch_id * max_seq_len * element_size + seq_id * element_size + eid;
    flat_target_idx = batch_id * max_seq_len * element_size + target_seq_id * element_size + eid;
  }
  y_data[flat_target_idx] = x_data[flat_src_idx];
#pragma unroll
  for (int i = 1; i < kReverseSequenceElementsPerThread; ++i) {
    if (eid + i < element_size) {
      y_data[flat_target_idx + i] = x_data[flat_src_idx + i];
    }
  }
}

template <typename T>
hipError_t ReverseSequenceCudaImpl(
    hipStream_t stream,
    const T* x_data,
    const int64_t* seq_len_data,
    T* y_data,
    const int batch_size,
    const int max_seq_len,
    const int element_size,
    const bool time_major) {
  int element_group_size = CeilDiv(element_size, kReverseSequenceElementsPerThread);
  fast_divmod fdm_grouped_stride_1(element_group_size);
  fast_divmod fdm_grouped_stride_0(element_group_size * ((time_major) ? batch_size : max_seq_len));
  int group_count = batch_size * max_seq_len * element_group_size;
  int blocksPerGrid = CeilDiv(group_count, GridDim::maxThreadsPerBlock);
  if (time_major) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(ReverseSequenceImplKernel<T, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       x_data, seq_len_data, y_data,
                       batch_size, max_seq_len, element_size, group_count,
                       fdm_grouped_stride_0, fdm_grouped_stride_1);
  } else {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(ReverseSequenceImplKernel<T, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       x_data, seq_len_data, y_data,
                       batch_size, max_seq_len, element_size, group_count,
                       fdm_grouped_stride_0, fdm_grouped_stride_1);
  }
  return hipSuccess;
}
#define InstantiateReverseSequenceImpl(T) \
template hipError_t ReverseSequenceCudaImpl( \
hipStream_t stream, \
const T* x_data, \
const int64_t* seq_len_data, \
T* y_data, \
const int batch_size, \
const int max_seq_len, \
const int element_size, \
const bool time_major)
InstantiateReverseSequenceImpl
(
int64_t
);
InstantiateReverseSequenceImpl
(
int32_t
);
InstantiateReverseSequenceImpl
(
int16_t
);
InstantiateReverseSequenceImpl
(
int8_t
);
}
// namespace rocm
}
// namespace onnxruntime
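For reference, here is a plain-CPU sketch of the same index math the kernel performs (a verification aid only; ReverseSequenceReference is not part of the ORT sources, and the layout assumptions are stated in the comments). Positions inside the valid prefix of each batch entry are mirrored to seq_len - 1 - seq, while padding positions past seq_len are copied unchanged.

#include <cstdint>
#include <vector>

// Layouts match the kernel: time-major is [max_seq_len, batch_size, element_size],
// batch-major is [batch_size, max_seq_len, element_size].
template <typename T>
void ReverseSequenceReference(const T* x, const int64_t* seq_len, T* y,
                              int batch_size, int max_seq_len, int element_size,
                              bool time_major) {
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int seq = 0; seq < max_seq_len; ++seq) {
      int target_seq = (seq < static_cast<int>(seq_len[batch]))
                           ? static_cast<int>(seq_len[batch]) - 1 - seq
                           : seq;  // padding is copied as-is
      for (int e = 0; e < element_size; ++e) {
        int64_t src, dst;
        if (time_major) {
          src = (int64_t)seq * batch_size * element_size + (int64_t)batch * element_size + e;
          dst = (int64_t)target_seq * batch_size * element_size + (int64_t)batch * element_size + e;
        } else {
          src = (int64_t)batch * max_seq_len * element_size + (int64_t)seq * element_size + e;
          dst = (int64_t)batch * max_seq_len * element_size + (int64_t)target_seq * element_size + e;
        }
        y[dst] = x[src];
      }
    }
  }
}

int main() {
  // Batch-major [2, 3, 1] input with seq_len {3, 2}.
  std::vector<float> x{1, 2, 3, 4, 5, 6}, y(6);
  std::vector<int64_t> seq_len{3, 2};
  ReverseSequenceReference(x.data(), seq_len.data(), y.data(), 2, 3, 1, /*time_major=*/false);
  // y is {3, 2, 1, 5, 4, 6}: all of batch 0 and the first 2 entries of batch 1 are reversed.
}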
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
hipError_t ReverseSequenceCudaImpl(
    hipStream_t stream,
    const T* x_data,
    const int64_t* seq_len_data,
    T* y_data,
    const int batch_size,
    const int max_seq_len,
    const int element_size,
    const bool time_major);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/scatter_elements.h"

#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/tensor/gather_elements.h"
#include "core/providers/rocm/tensor/gather_elements_impl.h"
#include "core/providers/rocm/tensor/scatter_elements_impl.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Scatter, kOnnxDomain, 9, 10, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                        DataTypeImpl::GetTensorType<int64_t>()}),
    ScatterElements);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    ScatterElements, kOnnxDomain, 11, 12, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                        DataTypeImpl::GetTensorType<int64_t>()}),
    ScatterElements);

ONNX_OPERATOR_KERNEL_EX(
    ScatterElements, kOnnxDomain, 13, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                        DataTypeImpl::GetTensorType<int64_t>()}),
    ScatterElements);

#define CASE_SCATTER_ELEMENTS_IMPL(type)                                                                          \
  case sizeof(type): {                                                                                            \
    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                                   \
    ORT_RETURN_IF_ERROR(ScatterElementsImpl(stream, input_data, indices_data, updates_data, output_data, args));  \
  } break

template <typename T>
struct ScatterElements::ComputeImpl {
  Status operator()(hipStream_t stream, const void* input_data_raw, const void* updates_data_raw,
                    const void* indices_data_raw, void* output_data_raw, const size_t index_element_size,
                    const GatherScatterElementsArgs& args) const {
    typedef typename ToHipType<T>::MappedType HipT;
    const HipT* input_data = reinterpret_cast<const HipT*>(input_data_raw);
    const HipT* updates_data = reinterpret_cast<const HipT*>(updates_data_raw);
    HipT* output_data = reinterpret_cast<HipT*>(output_data_raw);
    switch (index_element_size) {
      CASE_SCATTER_ELEMENTS_IMPL(int32_t);
      CASE_SCATTER_ELEMENTS_IMPL(int64_t);
      // should not reach here as we validate if the all relevant types are supported in the Compute method
      default:
        ORT_THROW("Unsupported indices element size by the ScatterElements ROCM kernel");
    }
    return Status::OK();
  }
};

#undef CASE_SCATTER_ELEMENTS_IMPL

Status ScatterElements::ComputeInternal(OpKernelContext* context) const {
  const auto* input_tensor = context->Input<Tensor>(0);
  const auto& input_shape = input_tensor->Shape();
  const int64_t input_size = input_shape.Size();
  const int64_t input_rank = static_cast<int64_t>(input_shape.NumDimensions());
  const int64_t axis = HandleNegativeAxis(axis_, input_rank);

  const auto* indices_tensor = context->Input<Tensor>(1);
  const auto* updates_tensor = context->Input<Tensor>(2);

  if (input_tensor->DataType() != updates_tensor->DataType()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "data type is different from updates type");
  }

  const auto& indices_shape = indices_tensor->Shape();
  auto indices_dims = indices_shape.GetDims();
  auto updates_dims = updates_tensor->Shape().GetDims();
  if (indices_dims.size() != updates_dims.size()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices and updates must have the same rank");
  }
  for (size_t i = 0; i < indices_dims.size(); ++i) {
    if (indices_dims[i] != updates_dims[i]) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices vs updates dimensions differs at position=", i,
                             " ", indices_dims[i], " vs ", updates_dims[i]);
    }
  }

  // Validate input shapes and ranks (invoke the static method in the CPU GatherElements kernel that hosts the shared
  // checks)
  ORT_RETURN_IF_ERROR(onnxruntime::GatherElements::ValidateInputShapes(input_shape, indices_shape, axis));

  auto* output_tensor = context->Output(0, input_shape);
  if (input_size == 0) return Status::OK();

  GatherScatterElementsArgs args;
  args.input_size = input_size;
  args.indices_size = indices_shape.Size();
  TensorShapeVector input_shape_vec = input_shape.AsShapeVector();
  TensorShapeVector indices_shape_vec = indices_shape.AsShapeVector();
  CoalesceDimensions(input_shape_vec, indices_shape_vec, nullptr, axis, args);

  // Use element size instead of concrete types so we can specialize less template functions to reduce binary size.
  int dtype = GetElementType(input_tensor->DataType()->Size());
  if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
    ORT_THROW("Unsupported element size by the ScatterElements ROCM kernel");
  }

  utils::MLTypeCallDispatcher<int8_t, MLFloat16, float, double> t_disp(dtype);
  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(), input_tensor->DataRaw(), updates_tensor->DataRaw(),
                                               indices_tensor->DataRaw(), output_tensor->MutableDataRaw(),
                                               indices_tensor->DataType()->Size(), args);
}

}  // namespace rocm
}  // namespace onnxruntime
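The GPU path above copies the input to the output and applies the updates through ScatterElementsImpl; the semantics it implements are the standard ONNX ScatterElements ones. The following is a minimal host-side sketch for rank-2 tensors (illustration only; ScatterElements2D is a hypothetical helper, not part of these sources), using the example from the ONNX operator documentation.

#include <cstdint>
#include <iostream>
#include <vector>

// Copy of `data` with updates scattered along `axis`; negative indices wrap, per the ONNX spec.
std::vector<float> ScatterElements2D(std::vector<float> data, int rows, int cols,
                                     const std::vector<int64_t>& indices,
                                     const std::vector<float>& updates,
                                     int ind_rows, int ind_cols, int axis) {
  for (int i = 0; i < ind_rows; ++i) {
    for (int j = 0; j < ind_cols; ++j) {
      int64_t idx = indices[i * ind_cols + j];
      if (axis == 0) {
        if (idx < 0) idx += rows;
        data[idx * cols + j] = updates[i * ind_cols + j];
      } else {
        if (idx < 0) idx += cols;
        data[i * cols + idx] = updates[i * ind_cols + j];
      }
    }
  }
  return data;
}

int main() {
  // 3x3 zeros, scatter a 2x3 update along axis 0 (the ONNX ScatterElements doc example).
  std::vector<float> data(9, 0.0f);
  std::vector<int64_t> indices{1, 0, 2, 0, 2, 1};
  std::vector<float> updates{1.0f, 1.1f, 1.2f, 2.0f, 2.1f, 2.2f};
  auto out = ScatterElements2D(data, 3, 3, indices, updates, 2, 3, /*axis=*/0);
  for (int r = 0; r < 3; ++r) {
    for (int c = 0; c < 3; ++c) std::cout << out[r * 3 + c] << ' ';
    std::cout << '\n';  // rows: 2 1.1 0 / 1 0 2.2 / 0 2.1 1.2
  }
}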
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

class ScatterElements final : public RocmKernel {
 public:
  ScatterElements(const OpKernelInfo& info) : RocmKernel(info) {
    ORT_ENFORCE(info.GetAttr<int64_t>("axis", &axis_).IsOK(), "Missing/Invalid 'axis' attribute value");
  }
  ~ScatterElements() = default;
  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  template <typename T>
  struct ComputeImpl;

  int64_t axis_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

struct GatherScatterElementsArgs;

template <typename T, typename TIndex>
Status ScatterElementsImpl(hipStream_t stream, const T* input_data, const TIndex* indices_data,
                           const T* updates_data, T* output_data, const GatherScatterElementsArgs& args);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/scatter_nd.h"
#include "core/providers/rocm/tensor/scatter_nd_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/cpu/tensor/utils.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    ScatterND, kOnnxDomain, 11, 12, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .MayInplace(0, 0),
    ScatterND);

ONNX_OPERATOR_KERNEL_EX(
    ScatterND, kOnnxDomain, 13, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .MayInplace(0, 0),
    ScatterND);

Status ScatterND::ComputeInternal(OpKernelContext* context) const {
  const auto* input_tensor = context->Input<Tensor>(0);
  const auto* indices_tensor = context->Input<Tensor>(1);
  const auto* updates_tensor = context->Input<Tensor>(2);

  const auto& input_shape = input_tensor->Shape();
  const auto& indices_shape = indices_tensor->Shape();
  const auto& updates_shape = updates_tensor->Shape();

  // Validate input shapes
  ORT_RETURN_IF_ERROR(onnxruntime::ScatterND::ValidateShapes(input_shape, indices_shape, updates_shape));

  auto* output_tensor = context->Output(0, input_shape);

  const void* input_data = input_tensor->DataRaw();
  void* output_data = output_tensor->MutableDataRaw();

  size_t element_size = input_tensor->DataType()->Size();

  if (input_data != output_data) {
    // TODO: Run benchmarks to determine if a dedicated kernel doing data copy will be faster than invoking hipMemcpy ?
    HIP_RETURN_IF_ERROR(
        hipMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
  }

  // Bail out early
  if (indices_shape.Size() == 0) {
    return Status::OK();
  }

  auto last_index_dimension = indices_shape[indices_shape.NumDimensions() - 1];

  // We need element counts for each dimension and the input dim value for each dimension
  // for the range [0, last_index_dimension).
  // To avoid multiple GPU data transfers, we combine this into one array and send it through
  TensorPitches input_strides(input_shape);
  std::vector<int64_t> element_counts_and_input_dims(last_index_dimension * 2, 0LL);
  for (int64_t i = 0; i < last_index_dimension; ++i) {
    element_counts_and_input_dims[i] = input_strides[i];
    element_counts_and_input_dims[i + last_index_dimension] = input_shape[i];
  }
  RocmAsyncBuffer<int64_t> element_counts_and_input_dims_gpu(this, element_counts_and_input_dims);
  ORT_RETURN_IF_ERROR(element_counts_and_input_dims_gpu.CopyToGpu());

  ORT_RETURN_IF_ERROR(ScatterNDImpl(
      Stream(),
      output_data,
      element_size,
      indices_shape.Size() / static_cast<size_t>(last_index_dimension),
      indices_tensor->Data<int64_t>(),  // only int64_t is supported for indices as per the onnx spec
      last_index_dimension,
      element_counts_and_input_dims_gpu.GpuPtr(),
      updates_tensor->DataRaw(),
      input_shape.SizeFromDimension(last_index_dimension)));

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
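The element_counts_and_input_dims buffer built above packs, for the first last_index_dimension axes, each axis's element pitch in the first half and that axis's size in the second half, so the kernel can turn one index tuple into a flat slice offset with a single dot product after wrapping or clamping out-of-range indices. Below is a small host-side sketch of that lookup; SliceOffset is a hypothetical helper used only for illustration.

#include <cstdint>
#include <iostream>
#include <vector>

int64_t SliceOffset(const std::vector<int64_t>& counts_and_dims,  // size = 2 * k
                    const int64_t* index_tuple, int64_t k) {
  int64_t offset = 0;
  for (int64_t d = 0; d < k; ++d) {
    int64_t idx = index_tuple[d];
    int64_t dim = counts_and_dims[d + k];
    // Same policy as the kernel: wrap negative indices, clamp anything out of range.
    if (idx < -dim) idx = 0;
    else if (idx < 0) idx += dim;
    else if (idx >= dim) idx = dim - 1;
    offset += idx * counts_and_dims[d];  // counts_and_dims[d] is the pitch of axis d
  }
  return offset;
}

int main() {
  // Input shape [4, 4, 4] with indices last dimension k = 2 -> pitches {16, 4}, dims {4, 4}.
  std::vector<int64_t> counts_and_dims{16, 4, 4, 4};
  int64_t tuple[2] = {2, -1};                                // -1 wraps to 3
  std::cout << SliceOffset(counts_and_dims, tuple, 2) << '\n';  // 2*16 + 3*4 = 44
}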
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/scatter_nd.h"

namespace onnxruntime {
namespace rocm {

class ScatterND final : public RocmKernel {
 public:
  explicit ScatterND(const OpKernelInfo& info) : RocmKernel(info) {}
  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.cu
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/scatter_nd_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/atomic/common.cuh"
namespace
onnxruntime
{
namespace
rocm
{
template
<
typename
T
>
__global__
void
_ScatterNDKernel
(
T
*
output_data
,
const
size_t
num_indices
,
const
int64_t
*
indices_data
,
const
int64_t
last_index_dimension
,
const
int64_t
*
element_counts_and_input_dims
,
const
T
*
updates_data
,
const
size_t
num_updates_elements
)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT
(
id
,
num_indices
);
// Compute the base offset into the output data
int64_t
data_offset
=
0
;
size_t
indices_start
=
last_index_dimension
*
id
;
size_t
indices_end
=
indices_start
+
last_index_dimension
;
for
(
size_t
i
=
indices_start
;
i
<
indices_end
;
++
i
)
{
int64_t
index
=
indices_data
[
i
];
int64_t
element_count_dim
=
element_counts_and_input_dims
[
i
-
indices_start
];
int64_t
dim_value
=
element_counts_and_input_dims
[
i
-
indices_start
+
last_index_dimension
];
// Clamp the index if out of range
// This would have been an error in the CPU kernel, but throwing in the ROCM EP
// is hard. This is the approach taken by other frameworks for out of bound indices
// in their corresponding GPU backends as well.
// index >= -dim_value && index < dim_value
if
(
index
>=
0
)
{
if
(
index
>=
dim_value
)
{
index
=
dim_value
-
1
;
}
}
else
{
if
(
index
<
-
dim_value
)
{
index
=
0
;
}
else
{
index
+=
dim_value
;
}
}
data_offset
+=
(
index
*
element_count_dim
);
}
const
T
*
updates_data_base
=
updates_data
+
num_updates_elements
*
id
;
T
*
output_data_base
=
output_data
+
data_offset
;
for
(
size_t
i
=
0
;
i
<
num_updates_elements
;
++
i
)
{
output_data_base
[
i
]
=
updates_data_base
[
i
];
}
}
Status
ScatterNDImpl
(
hipStream_t
stream
,
void
*
output_data
,
const
size_t
element_size
,
const
size_t
num_indices
,
const
int64_t
*
indices_data
,
const
int64_t
last_index_dimension
,
const
int64_t
*
element_counts_and_input_dims
,
const
void
*
updates_data
,
const
size_t
num_updates_elements
)
{
if
(
num_indices
==
0
)
return
Status
::
OK
();
// Parallelize on number of indices
int
blocksPerGrid
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
num_indices
)
/
GridDim
::
maxThreadsPerBlock
));
switch
(
element_size
)
{
case
sizeof
(
int8_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int8_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int8_t
*>
(
updates_data
),
num_updates_elements
);
break
;
case
sizeof
(
int16_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int16_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int16_t
*>
(
updates_data
),
num_updates_elements
);
break
;
case
sizeof
(
int32_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int32_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int32_t
*>
(
updates_data
),
num_updates_elements
);
break
;
case
sizeof
(
int64_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int64_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int64_t
*>
(
updates_data
),
num_updates_elements
);
break
;
default:
// Shouldn't hit this
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Type not supported for ScatterND operator"
);
}
return
Status
::
OK
();
}
}
// namespace rocm
}
// namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

Status ScatterNDImpl(
    hipStream_t stream,
    void* output_data,
    const size_t element_size,
    const size_t num_indices,
    const int64_t* indices_data,
    const int64_t last_index_dimension,
    const int64_t* element_counts_and_input_dims,
    const void* updates_data,
    const size_t num_updates_elements);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/sequence_op.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "sequence_op.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    SequenceAt, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("I", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>()}),
    SequenceAt);

ONNX_OPERATOR_KERNEL_EX(
    SequenceConstruct, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
    SequenceConstruct);

ONNX_OPERATOR_KERNEL_EX(
    SequenceEmpty, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
    SequenceEmpty);

ONNX_OPERATOR_KERNEL_EX(
    SequenceLength, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
    SequenceLength);

ONNX_OPERATOR_KERNEL_EX(
    ConcatFromSequence, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
    ConcatFromSequence);

ONNX_OPERATOR_KERNEL_EX(
    SequenceErase, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("I", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>()}),
    SequenceErase);

ONNX_OPERATOR_KERNEL_EX(
    SequenceInsert, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 2)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("I", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>()}),
    SequenceInsert);

}  // namespace rocm
}  // namespace onnxruntime