Commit 1a91fcc2 authored by gaoqiong's avatar gaoqiong

add files required for dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
void GatherImpl(
hipStream_t stream,
const int64_t input_block_size,
const int64_t indices_max,
const fast_divmod& output_block_size,
const fast_divmod& block_size,
const void* indices_data,
size_t index_element_size,
const void* input_data,
size_t element_size,
void* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/gather_nd.h"
#include "core/providers/rocm/tensor/gather_nd_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
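// Verifies that each tensor shape has at least num_batch_dimensions dimensions and that the
// leading num_batch_dimensions dimensions agree across all of the given shapes.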
Status CheckBatchDimensionsMatch(
size_t num_batch_dimensions,
const std::vector<std::reference_wrapper<TensorShape>>& tensor_shapes) {
for (size_t tensor_shape_idx = 0; tensor_shape_idx < tensor_shapes.size(); ++tensor_shape_idx) {
const TensorShape& tensor_shape = tensor_shapes[tensor_shape_idx];
ORT_RETURN_IF_NOT(
num_batch_dimensions <= tensor_shape.NumDimensions(),
"Number of batch dimensions exceeds tensor rank. ",
"Batch dimension count: ", num_batch_dimensions,
", tensor rank: ", tensor_shape.NumDimensions(),
", tensor index: ", tensor_shape_idx);
}
if (tensor_shapes.empty()) return Status::OK();
const TensorShape& first_tensor_shape = tensor_shapes.front();
for (size_t batch_dimension_idx = 0; batch_dimension_idx < num_batch_dimensions; ++batch_dimension_idx) {
for (size_t tensor_shape_idx = 1; tensor_shape_idx < tensor_shapes.size(); ++tensor_shape_idx) {
const TensorShape& other_tensor_shape = tensor_shapes[tensor_shape_idx];
ORT_RETURN_IF_NOT(
first_tensor_shape[batch_dimension_idx] == other_tensor_shape[batch_dimension_idx],
"Batch dimensions differ at index ", batch_dimension_idx, ": ",
first_tensor_shape[batch_dimension_idx], " != ", other_tensor_shape[batch_dimension_idx],
", tensor indices: 0, ", tensor_shape_idx);
}
}
return Status::OK();
}
template <typename TIndex>
Status GatherNDBase::PrepareCompute(
hipStream_t stream,
const int64_t batch_dims,
const TensorShape& input_shape,
const TensorShape& indices_shape,
const Tensor* indices_tensor,
int64_t& num_slices,
int64_t& slice_size,
IAllocatorUniquePtr<int64_t>& input_slice_offsets_buffer) const {
const auto num_slice_dims = indices_shape[indices_shape.NumDimensions() - 1];
num_slices = indices_shape.SizeToDimension(indices_shape.NumDimensions() - 1);
slice_size = input_shape.SizeFromDimension(batch_dims + num_slice_dims);
const auto num_batches = input_shape.SizeToDimension(batch_dims);
const auto input_batch_stride = input_shape.SizeFromDimension(batch_dims);
const auto num_slices_per_batch = num_slices / num_batches;
const TIndex* const indices_data = indices_tensor->Data<TIndex>();
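// sizes_from_slice_dims[i] is the element stride of input dimension (batch_dims + i), i.e. how
// many input elements one step along that slice dimension spans.
// Illustrative example (not from the original source): input shape [2, 3, 4, 5] with
// batch_dims = 1 and num_slice_dims = 2 gives slice_size = 5 and sizes_from_slice_dims = {20, 5}.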
std::vector<int64_t> sizes_from_slice_dims(num_slice_dims);
{
auto running_product = slice_size;
for (int64_t i = 0; i < num_slice_dims; ++i) {
sizes_from_slice_dims[num_slice_dims - 1 - i] = running_product;
running_product *= input_shape[batch_dims + num_slice_dims - 1 - i];
}
}
auto sizes_from_slice_dims_buffer = GetScratchBuffer<int64_t>(sizes_from_slice_dims.size());
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
sizes_from_slice_dims_buffer.get(),
sizes_from_slice_dims.data(),
sizes_from_slice_dims.size() * sizeof(int64_t),
hipMemcpyHostToDevice, stream));
input_slice_offsets_buffer = GetScratchBuffer<int64_t>(num_slices);
TArray<int64_t> input_dims(input_shape.GetDims());
ComputeSliceOffsetsImpl(
stream,
batch_dims,
input_dims,
num_slices,
num_slices_per_batch,
input_batch_stride,
num_slice_dims,
sizes_from_slice_dims_buffer.get(),
indices_data,
input_slice_offsets_buffer.get());
return Status::OK();
}
#define REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(TIndex, startver, endver) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
GatherND, \
kOnnxDomain, \
startver, \
endver, \
TIndex, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", \
std::vector<MLDataType>{ \
DataTypeImpl::GetTensorType<float>(), \
DataTypeImpl::GetTensorType<double>(), \
DataTypeImpl::GetTensorType<MLFloat16>(), \
DataTypeImpl::GetTensorType<int64_t>(), \
DataTypeImpl::GetTensorType<bool>(), \
}) \
.TypeConstraint("indices", DataTypeImpl::GetTensorType<TIndex>()), \
GatherND<TIndex>);
#define REGISTER_KERNEL_TYPED_GATHER_ND(TIndex, ver) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
GatherND, kOnnxDomain, ver, TIndex, kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", BuildKernelDefConstraints<float, MLFloat16, double, int64_t, BFloat16, bool>()) \
.TypeConstraint("indices", DataTypeImpl::GetTensorType<TIndex>()), \
GatherND<TIndex>);
REGISTER_KERNEL_TYPED_GATHER_ND(int64_t, 13)
REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 12, 12)
REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 11, 11)
template <typename T>
struct GatherNDComputeImpl {
void operator()(hipStream_t stream,
const int64_t num_slices,
const int64_t slice_size,
const void* const kernel_input_data,
void* const kernel_output_data,
int64_t* const input_slice_offsets_data) const {
typedef typename ToHipType<T>::MappedType HipT;
GatherNDImpl<HipT>(stream,
num_slices, kernel_input_data,
kernel_output_data, slice_size,
input_slice_offsets_data);
}
};
template <typename TIndex>
Status GatherND<TIndex>::ComputeInternal(OpKernelContext* context) const {
auto input_tensor = context->Input<Tensor>(0);
auto indices_tensor = context->Input<Tensor>(1);
ORT_RETURN_IF_NOT(input_tensor != nullptr, "input_tensor == nullptr");
ORT_RETURN_IF_NOT(indices_tensor != nullptr, "indices_tensor == nullptr");
auto input_shape = input_tensor->Shape();
auto indices_shape = indices_tensor->Shape();
if (indices_shape.NumDimensions() == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"indices tensor must has rank larger than 0");
}
auto last_indices_dimension = batch_dims_ + indices_shape[indices_shape.NumDimensions() - 1];
if (last_indices_dimension > static_cast<int64_t>(input_shape.NumDimensions())) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"last dimension of indices must not be larger than rank of input tensor");
}
ORT_RETURN_IF_ERROR(CheckBatchDimensionsMatch(
static_cast<size_t>(batch_dims_), {input_shape, indices_shape}));
// Output shape: indices_shape[:-1] followed by input_shape[last_indices_dimension:]
std::vector<int64_t> shape(indices_shape.GetDims().begin(), indices_shape.GetDims().end() - 1);
shape.insert(shape.end(), input_shape.GetDims().begin() + last_indices_dimension, input_shape.GetDims().end());
auto output_tensor = context->Output(0, TensorShape(shape));
// Bail out early in case the output is going to be empty
if (output_tensor->Shape().Size() == 0) {
return Status::OK();
}
// Compute
int64_t num_slices;
int64_t slice_size;
IAllocatorUniquePtr<int64_t> input_slice_offsets_buffer;
ORT_RETURN_IF_ERROR(PrepareCompute<TIndex>(Stream(),
batch_dims_, input_shape, indices_shape, indices_tensor,
num_slices, slice_size, input_slice_offsets_buffer));
const void* const kernel_input_data = input_tensor->DataRaw();
void* const kernel_output_data = output_tensor->MutableDataRaw();
utils::MLTypeCallDispatcher<float, MLFloat16, double, int64_t, BFloat16, bool> t_disp(input_tensor->GetElementType());
t_disp.Invoke<GatherNDComputeImpl>(Stream(), num_slices, slice_size, kernel_input_data, kernel_output_data,
input_slice_offsets_buffer.get());
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
Status CheckBatchDimensionsMatch(
size_t num_batch_dimensions,
const std::vector<std::reference_wrapper<TensorShape>>& tensor_shapes);
class GatherNDBase : public RocmKernel {
public:
GatherNDBase(const OpKernelInfo& info) : RocmKernel(info) {
info.GetAttrOrDefault("batch_dims", &batch_dims_, static_cast<int64_t>(0));
ORT_ENFORCE(batch_dims_ >= 0);
}
protected:
template <typename TIndex>
Status PrepareCompute(
hipStream_t stream,
const int64_t batch_dims,
const TensorShape& input_shape,
const TensorShape& indices_shape,
const Tensor* indices_tensor,
int64_t& num_slices,
int64_t& slice_size,
IAllocatorUniquePtr<int64_t>& input_slice_offsets_buffer) const;
int64_t batch_dims_;
};
template <typename Tind>
class GatherND final : public GatherNDBase {
public:
GatherND(const OpKernelInfo& info) : GatherNDBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/gather_nd_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/atomic/common.cuh"
namespace onnxruntime {
namespace rocm {
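// One thread per slice: the slice's batch contributes batch_idx * input_batch_stride, and each
// (possibly negative) index along a slice dimension adds index * stride of that dimension, with
// the strides supplied precomputed in sizes_from_slice_dims_data.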
template <typename TIndex>
__global__ void _ComputeSliceOffsetsKernel(
const int64_t batch_dims,
const TArray<int64_t> input_dims,
const size_t num_slices,
const size_t num_slices_per_batch,
const size_t input_batch_stride,
const size_t num_slice_dims,
const int64_t* const sizes_from_slice_dims_data, // num_slice_dims elements
const TIndex* const indices_data, // num_slices * num_slice_dims elements
int64_t* const input_slice_offsets_data) { // num_slices elements
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(slice_idx, num_slices)
const size_t batch_idx = slice_idx / num_slices_per_batch;
const size_t base_offset = batch_idx * input_batch_stride;
const TIndex* const slice_indices = indices_data + slice_idx * num_slice_dims;
size_t relative_slice_offset = 0;
for (size_t dim_idx = 0; dim_idx < num_slice_dims; ++dim_idx) {
int64_t index = static_cast<int64_t>(slice_indices[dim_idx]);
const size_t input_dim_idx = batch_dims + dim_idx;
HIP_KERNEL_ASSERT(index >= -input_dims[input_dim_idx] && index < input_dims[input_dim_idx]);
if (index < 0) index += input_dims[input_dim_idx];
relative_slice_offset += index * sizes_from_slice_dims_data[dim_idx];
}
input_slice_offsets_data[slice_idx] = base_offset + relative_slice_offset;
}
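// One thread per output element: element i belongs to slice i / slice_size, whose start offset
// in the input is slice_offsets[i / slice_size]; i % slice_size selects the element within it.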
template <typename T>
__global__ void _GatherNDKernel(
const size_t num_slices,
const T* input_data,
T* output_data,
const size_t slice_size,
const int64_t* slice_offsets) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size)
uint64_t slice_offset = slice_offsets[i / slice_size];
output_data[i] = input_data[slice_offset + i % slice_size];
};
template <typename TIndex>
void ComputeSliceOffsetsImpl(
hipStream_t stream,
const int64_t batch_dims,
const TArray<int64_t> input_dims,
const size_t num_slices,
const size_t num_slices_per_batch,
const size_t input_batch_stride,
const size_t num_slice_dims,
const int64_t* const sizes_from_slice_dims_data, // num_slice_dims elements
const TIndex* const indices_data, // num_slices * num_slice_dims elements
int64_t* const input_slice_offsets_data) { // num_slices elements
const unsigned int blocks_per_grid = static_cast<unsigned int>(CeilDiv(num_slices, GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream,
batch_dims,
input_dims,
num_slices,
num_slices_per_batch,
input_batch_stride,
num_slice_dims,
sizes_from_slice_dims_data,
indices_data,
input_slice_offsets_data);
}
template <typename T>
void GatherNDImpl(
hipStream_t stream,
const size_t num_slices,
const void* input_data,
void* output_data,
const size_t slice_size,
const int64_t* input_slice_offsets_data) {
const unsigned int blocks_per_grid = static_cast<unsigned int>(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel<T>), blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream,
num_slices, static_cast<const T*>(input_data), static_cast<T*>(output_data), slice_size, input_slice_offsets_data);
}
#define SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex) \
template void ComputeSliceOffsetsImpl<TIndex>( \
hipStream_t stream, \
const int64_t batch_dims, \
const TArray<int64_t> input_dims, \
const size_t num_slices, \
const size_t num_slices_per_batch, \
const size_t input_batch_stride, \
const size_t num_slice_dims, \
const int64_t* const sizes_from_slice_dims_data, \
const TIndex* const indices_data, \
int64_t* const input_slice_offsets_data);
#define SPECIALIZED_IMPL(T) \
template void GatherNDImpl<T>(hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data);
SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t)
SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t)
SPECIALIZED_IMPL(bool)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(int64_t)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(BFloat16)
#endif
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template<typename TIndex>
void ComputeSliceOffsetsImpl(
hipStream_t stream,
const int64_t batch_dims,
const TArray<int64_t> input_dims,
const size_t num_slices,
const size_t num_slices_per_batch,
const size_t input_batch_stride,
const size_t num_slice_dims,
const int64_t* const sizes_from_slice_dims_data, // num_slice_dims elements
const TIndex* const indices_data, // num_slices * num_slice_dims elements
int64_t* const input_slice_offsets_data); // num_slices elements
template <typename T>
void GatherNDImpl(
hipStream_t stream,
const size_t num_slices,
const void* input_data,
void* output_data,
const size_t slice_size,
const int64_t* input_slice_offsets_data);
#ifdef ENABLE_TRAINING
template <typename T>
void GatherNDGradImpl(
hipStream_t stream,
const size_t num_slices,
const void* update_data,
void* output_data,
const size_t slice_size,
const int64_t* input_slice_offsets_data);
#endif
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "identity_op.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Dropout,
kOnnxDomain,
7, 9,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>()})
.Alias(0, 0),
IdentityOp<true>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Dropout,
kOnnxDomain,
10,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>()})
.TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>())
.Alias(0, 0),
IdentityOp<true>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Identity,
kOnnxDomain,
1, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(0, 0),
IdentityOp<false>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Identity,
kOnnxDomain,
13, 13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(0, 0),
IdentityOp<false>);
ONNX_OPERATOR_KERNEL_EX(
Identity,
kOnnxDomain,
14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypes())
.Alias(0, 0),
IdentityOp<false>);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <bool is_dropout>
class IdentityOp final : public RocmKernel {
public:
IdentityOp(const OpKernelInfo& info) : RocmKernel(info) {
}
Status ComputeInternal(OpKernelContext* context) const override {
auto X_ml_type = context->InputType(0);
if (X_ml_type->IsTensorType()) {
const Tensor* X = context->Input<Tensor>(0);
if (nullptr == X) {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: input count mismatch.");
}
const TensorShape& shape = X->Shape();
Tensor* Y = context->Output(0, shape);
if (nullptr == Y) {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: failed to allocate output tensor.");
}
auto X_type = X->DataType();
const void* source = X->DataRaw(X_type);
void* target = Y->MutableDataRaw(X_type);
// If source and target pointers are not equal, we need to copy the data.
if (target != source) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(), hipMemcpyDeviceToDevice, Stream()));
}
if (is_dropout) {
Tensor* mask = context->Output(1, shape);
// A 'nullptr' return means the optional 'mask' output is not used.
if (mask != nullptr) {
// Opset 7 differs from opset 10 in that the type of the 'mask' output
// is tied to the type of the input in opset 7, whereas in opset 10
// 'mask' is always 'bool', so zero-filling the raw buffer works for both.
void* mask_data = mask->MutableDataRaw();
// In 'test'/'inference' mode, there are no input values dropped out
// so fill the buffer with 0/false
HIP_RETURN_IF_ERROR(hipMemsetAsync(mask_data, 0, mask->SizeInBytes(), Stream()));
}
}
} else if (X_ml_type->IsTensorSequenceType()) {
const TensorSeq* X = context->Input<TensorSeq>(0);
ORT_ENFORCE(X != nullptr, "IdentityOp rocm: input tensor is missing.");
TensorSeq* Y = context->Output<TensorSeq>(0);
ORT_ENFORCE(Y != nullptr, "IdentityOp rocm: failed to allocate output tensor sequence.");
if (X == Y) {
return Status::OK();
}
auto X_type = X->DataType();
Y->SetType(X_type);
AllocatorPtr alloc;
auto status = context->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: unable to get an allocator.");
}
auto X_size = X->Size();
for (size_t i = 0; i < X_size; ++i) {
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
Y->Add(std::move(*target_tensor));
}
} else {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: unsupported input type.");
}
return Status::OK();
}
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "nonzero_impl.h"
#include "core/providers/rocm/shared_inc/rocm_call.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include <hipcub/hipcub.hpp>
namespace onnxruntime {
namespace rocm {
static const int NONZERO_THREADS_PER_BLOCK = GridDim::maxThreadsPerBlock;
// TODO: check overflow
int NonZeroCalcBlockCount(int64_t x_size) {
return static_cast<int>(CeilDiv(x_size, NONZERO_THREADS_PER_BLOCK));
}
hipError_t NonZeroCalcPrefixSumTempStorageBytes(
hipStream_t stream, int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes) {
temp_storage_bytes = 0;
return hipcub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream);
}
hipError_t NonZeroInclusivePrefixSum(
hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks) {
return hipcub::DeviceScan::InclusiveSum(
d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream);
}
template <typename InputT, int THREADS_PER_BLOCK>
__global__ void NonZeroCountEachBlockKernel(const InputT* x, int64_t x_size, int* count_in_blocks) {
typedef hipcub::BlockReduce<int, THREADS_PER_BLOCK, hipcub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
__shared__ typename BlockReduceT::TempStorage temp_storage;
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
// const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
int nz = 0;
if (index < x_size && bool(x[index])) ++nz;
int count = BlockReduceT(temp_storage).Sum(nz);
if (threadIdx.x == 0) {
count_in_blocks[blockIdx.x] = count;
}
}
template <typename InputT, int THREADS_PER_BLOCK>
__global__ void NonZeroOutputPositionsKernel(
const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod> x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results) {
typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScanT;
__shared__ typename BlockScanT::TempStorage temp_storage;
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
// const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
int nz = 0;
if (index < x_size && bool(x[index])) ++nz;
int pos_in_block = 0;
BlockScanT(temp_storage).InclusiveSum(nz, pos_in_block);
int result_position = ((blockIdx.x == 0) ? 0 : prefix_counts[blockIdx.x - 1]) + pos_in_block - nz;
if (index < x_size && bool(x[index])) {
int remain = (int)index, dim = 0;
for (int axis = 0, rp = result_position; axis < x_rank; ++axis, rp += nonzero_elements) {
x_strides[axis].divmod(remain, dim, remain);
results[rp] = (int64_t)dim;
}
}
}
constexpr int MAX_DIMS = 16;
template <typename InputT, int THREADS_PER_BLOCK>
__global__ void UnRolledNonZeroOutputPositionsKernel(
const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod> x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results) {
typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScanT;
__shared__ typename BlockScanT::TempStorage temp_storage;
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
// const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
int nz = 0;
if (index < x_size && bool(x[index])) ++nz;
int pos_in_block = 0;
BlockScanT(temp_storage).InclusiveSum(nz, pos_in_block);
int result_position = ((blockIdx.x == 0) ? 0 : prefix_counts[blockIdx.x - 1]) + pos_in_block - nz;
if (index < x_size && bool(x[index])) {
int remain = (int)index, dim = 0;
int rp = result_position;
#pragma unroll
for (int axis = 0; axis < MAX_DIMS; ++axis) {
if (axis == x_rank) {
break;
}
x_strides[axis].divmod(remain, dim, remain);
results[rp] = (int64_t)dim;
rp += nonzero_elements;
}
}
}
template <typename InputT>
hipError_t NonZeroCountEachBlock(hipStream_t stream, const InputT* x, int64_t x_size, int* count_in_blocks) {
int num_blocks = NonZeroCalcBlockCount(x_size);
hipLaunchKernelGGL(HIP_KERNEL_NAME(NonZeroCountEachBlockKernel<InputT, NONZERO_THREADS_PER_BLOCK>), num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
x, x_size, count_in_blocks);
return hipSuccess;
}
template <typename InputT>
hipError_t NonZeroOutputPositions(
hipStream_t stream, const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod>& x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results) {
int num_blocks = NonZeroCalcBlockCount(x_size);
if (x_rank > MAX_DIMS) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(NonZeroOutputPositionsKernel<InputT, NONZERO_THREADS_PER_BLOCK>), num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
x, x_size, x_rank, x_strides,
prefix_counts, nonzero_elements, results);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(UnRolledNonZeroOutputPositionsKernel<InputT, NONZERO_THREADS_PER_BLOCK>), num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
x, x_size, x_rank, x_strides,
prefix_counts, nonzero_elements, results);
}
return hipSuccess;
}
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const bool*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const uint8_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const int64_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const int32_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const float*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const half*, int64_t, int*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const bool*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const uint8_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const int64_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const int32_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const float*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const half*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
int NonZeroCalcBlockCount(int64_t x_size);
hipError_t NonZeroCalcPrefixSumTempStorageBytes(hipStream_t stream, int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes);
hipError_t NonZeroInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks);
// Counts the nonzero elements in each block into counts_in_blocks;
// the counts_in_blocks buffer must be pre-allocated on the GPU by the caller.
template<typename InputT>
hipError_t NonZeroCountEachBlock(hipStream_t stream, const InputT* x, int64_t x_size, int* counts_in_blocks);
// Writes the positions of the nonzero elements using input x and the per-block prefix_counts.
template<typename InputT>
hipError_t NonZeroOutputPositions(
hipStream_t stream, const InputT *x, int64_t x_size, int x_rank, const TArray<fast_divmod>& x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "nonzero_op.h"
#include "nonzero_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
// kernel builder functions
#define NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(type, type_name) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
NonZero, \
kOnnxDomain, \
9, 12, \
type_name, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
NonZero<type>) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
NonZero, \
kOnnxDomain, \
13, \
type_name, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
NonZero<type>)
#define NONZERO_TYPED_KERNEL(type) \
NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(type, type)
// start with a subset of types, enable more as needed...
NONZERO_TYPED_KERNEL(bool)
NONZERO_TYPED_KERNEL(uint8_t)
//NONZERO_TYPED_KERNEL(uint16_t)
//NONZERO_TYPED_KERNEL(uint32_t)
//NONZERO_TYPED_KERNEL(uint64_t)
//NONZERO_TYPED_KERNEL(int8_t)
//NONZERO_TYPED_KERNEL(int16_t)
NONZERO_TYPED_KERNEL(int32_t)
NONZERO_TYPED_KERNEL(int64_t)
NONZERO_TYPED_KERNEL(MLFloat16)
//NONZERO_TYPED_KERNEL(BFloat16)
NONZERO_TYPED_KERNEL(float)
//NONZERO_TYPED_KERNEL(double)
//NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(std::string, string)
#undef NONZERO_TYPED_KERNEL
#undef NONZERO_TYPED_KERNEL_WITH_TYPE_NAME
template <typename T>
Status NonZero<T>::ComputeInternal(OpKernelContext* context) const {
static const TensorShape kScalarDims{1};
const auto x = context->Input<Tensor>(0);
int nonzero_elements = 0;
const auto& x_shape = x->Shape();
const int x_rank = x_shape.IsScalar() ? 1 : static_cast<int>(x_shape.NumDimensions());
auto x_dims = (x_shape.IsScalar()) ? kScalarDims.GetDims() : x_shape.GetDims();
const int64_t x_size = x_shape.Size();
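// Three-stage GPU pipeline: count the nonzero elements per block, run a hipcub inclusive prefix
// sum over the per-block counts, then scatter the coordinates of each nonzero element into the
// [x_rank, nonzero_elements] output, one column per nonzero element.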
if (x_size > 0) {
auto x_data = reinterpret_cast<const typename ToHipType<T>::MappedType*>(x->Data<T>());
const int number_of_blocks = NonZeroCalcBlockCount(x_size);
auto prefix_buffer = GetScratchBuffer<int>(number_of_blocks);
int* prefix_counts = prefix_buffer.get();
HIP_RETURN_IF_ERROR(NonZeroCountEachBlock(Stream(), x_data, x_size, prefix_counts));
size_t temp_storage_bytes = 0;
HIP_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(Stream(), prefix_counts, number_of_blocks, temp_storage_bytes));
auto temp_buffer = GetScratchBuffer<uint8_t>(temp_storage_bytes);
auto d_temp_storage = temp_buffer.get();
HIP_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(Stream(), d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks));
// hipMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
&nonzero_elements, prefix_counts + number_of_blocks - 1,
sizeof(int), hipMemcpyDeviceToHost, Stream()));
TArray<fast_divmod> fdm_x_strides(x_rank);
TensorPitches x_strides(x_dims);
for (auto i = 0; i < x_rank; i++) {
fdm_x_strides[i] = fast_divmod(static_cast<int>(x_strides[i]));
}
auto* output_tensor = context->Output(0, {x_rank, nonzero_elements});
ORT_ENFORCE(output_tensor, "failed to get first output!");
HIP_RETURN_IF_ERROR(NonZeroOutputPositions(
Stream(), x_data, x_size, x_rank, fdm_x_strides,
prefix_counts, nonzero_elements, output_tensor->MutableData<int64_t>()));
} else {
context->Output(0, {x_rank, nonzero_elements});
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class NonZero final : public RocmKernel {
public:
NonZero(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/onehot.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
// T1: indices, T2: depth, T3: values
#define REGISTER_TYPED_ONE_HOT_OP(in_type, out_type, depth_type) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OneHot, \
kOnnxDomain, \
11, \
in_type##_##out_type##_##depth_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) /* Keep depth in CPU */ \
.InputMemoryType(OrtMemTypeCPUInput, 2) /* Keep values in CPU */ \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<in_type>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<depth_type>()) \
.TypeConstraint("T3", DataTypeImpl::GetTensorType<out_type>()), \
OneHotOp<in_type, out_type, depth_type>);
REGISTER_TYPED_ONE_HOT_OP(int64_t, int64_t, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int64_t, float, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int32_t, float, int32_t)
REGISTER_TYPED_ONE_HOT_OP(int64_t, MLFloat16, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int32_t, MLFloat16, int32_t)
template <typename in_type, typename out_type, typename depth_type>
Status OneHotOp<in_type, out_type, depth_type>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<out_type>::MappedType HipT_Out;
const Tensor* indices = ctx->Input<Tensor>(0);
const Tensor* depth = ctx->Input<Tensor>(1);
const Tensor* values = ctx->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(ValidateInputs(depth, values));
const auto* depth_data = depth->Data<depth_type>();
const auto depth_val = static_cast<int64_t>(
*depth_data); // As per the spec, if 'depth' is of a non-integer type it is cast to int64 before use.
if (depth_val <= 0) {
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Depth must be a positive value.");
}
// prepare output shape
int64_t prefix_dim_size, suffix_dim_size;
TensorShapeVector output_shape;
ORT_RETURN_IF_ERROR(PrepareOutputShape(indices, depth_val, axis_, prefix_dim_size, suffix_dim_size, output_shape));
// allocate output
const auto* values_data = reinterpret_cast<const HipT_Out*>(values->Data<out_type>());
Tensor* output = ctx->Output(0, TensorShape(output_shape));
// edge case where we have a dim with a value of 0
if (output->Shape().Size() == 0)
return Status::OK();
const fast_divmod fdm_suffix(gsl::narrow_cast<int>(suffix_dim_size));
const auto* indices_data = indices->Data<in_type>();
auto* output_data = reinterpret_cast<HipT_Out*>(output->MutableData<out_type>());
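// Fast path: when off_value is zero, zero-fill the whole output once and only scatter on_value.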
if (values_data[0] == HipT_Out(0.f)) {
HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes(), Stream()));
OneHotWithZeroOffValueImpl(Stream(),
indices_data,
fdm_suffix,
depth_val,
values_data[1],
output_data,
indices->Shape().Size());
return Status::OK();
}
const fast_divmod fdm_depth_suffix(gsl::narrow_cast<int>(depth_val * suffix_dim_size));
OneHotImpl(Stream(),
indices_data, fdm_depth_suffix, fdm_suffix, depth_val,
values_data[1],
values_data[0],
output_data,
output->Shape().Size());
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/tensor/onehot.h"
namespace onnxruntime {
namespace rocm {
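// The OneHot output is laid out as [prefix, depth, suffix] once the axis has been normalized.
// _OneHotImpl assigns one thread per output element: id is split into
// (prefix_index, depth_index, suffix_index) with the precomputed fast_divmod values, and the
// corresponding entry of the indices tensor decides whether the element gets on_value or off_value.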
template <typename in_type, typename out_type>
__global__ void _OneHotImpl(
const in_type* indices_data,
const fast_divmod fdm_depth_suffix,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
const out_type off_value,
out_type* output_data,
HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int prefix_index, prefix_offset;
fdm_depth_suffix.divmod(id, prefix_index, prefix_offset);
int depth_index, suffix_index;
fdm_suffix.divmod(prefix_offset, depth_index, suffix_index);
HIP_LONG indices_index = prefix_index * fdm_suffix.d_ + suffix_index;
// handle index outside the range [-depth, depth-1] case
bool is_valid_range = indices_data[indices_index] >= -depth_val && indices_data[indices_index] < depth_val;
// handle negative index
in_type adjusted_indice = (indices_data[indices_index] + depth_val) % depth_val;
output_data[id] = (is_valid_range && adjusted_indice == in_type(depth_index)) ? on_value : off_value;
}
template<typename in_type, typename out_type>
__global__ void _OneHotWithZeroOffValueImpl(
const in_type* indices_data,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
out_type* output_data,
HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
if (indices_data[id] >= -depth_val && indices_data[id] < depth_val) {
in_type adjusted_index = indices_data[id] >= 0 ? indices_data[id] : indices_data[id] + depth_val;
int q, r;
fdm_suffix.divmod(id, q, r);
output_data[(q * depth_val + adjusted_index) * fdm_suffix.d_ + r] = on_value;
}
}
template <typename in_type, typename out_type>
void OneHotImpl(
hipStream_t stream,
const in_type* indices_data,
const fast_divmod fdm_depth_suffix,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
const out_type off_value,
out_type* output_data,
size_t count) {
int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
HIP_LONG N = static_cast<HIP_LONG>(count);
hipLaunchKernelGGL(HIP_KERNEL_NAME(_OneHotImpl<in_type, out_type>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
indices_data,
fdm_depth_suffix,
fdm_suffix,
depth_val,
on_value,
off_value,
output_data,
N);
}
template <typename in_type, typename out_type>
void OneHotWithZeroOffValueImpl(
hipStream_t stream,
const in_type* indices_data,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
out_type* output_data,
size_t count) {
int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
HIP_LONG N = static_cast<HIP_LONG>(count);
hipLaunchKernelGGL(HIP_KERNEL_NAME(_OneHotWithZeroOffValueImpl<in_type, out_type>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
indices_data,
fdm_suffix,
depth_val,
on_value,
output_data,
N);
}
#define SPECIALIZED_OneHotImpl(in_type, out_type) \
template void OneHotImpl( \
hipStream_t stream, \
const in_type* indices_data, \
const fast_divmod fdm_depth_suffix, \
const fast_divmod fdm_suffix, \
const int64_t depth_val, \
const out_type on_value, \
const out_type off_value, \
out_type* output_data, \
size_t count);
SPECIALIZED_OneHotImpl(int64_t, int64_t)
SPECIALIZED_OneHotImpl(int64_t, float)
SPECIALIZED_OneHotImpl(int32_t, float)
SPECIALIZED_OneHotImpl(int64_t, half)
SPECIALIZED_OneHotImpl(int32_t, half)
#define SPECIALIZED_OneHotWithZeroOffValueImpl(in_type, out_type) \
template void OneHotWithZeroOffValueImpl( \
hipStream_t stream, \
const in_type* indices_data, \
const fast_divmod fdm_suffix, \
const int64_t depth_val, \
const out_type on_value, \
out_type* output_data, \
size_t count);
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, int64_t)
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, float)
SPECIALIZED_OneHotWithZeroOffValueImpl(int32_t, float)
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, half)
SPECIALIZED_OneHotWithZeroOffValueImpl(int32_t, half)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename in_type, typename out_type>
void OneHotImpl(
hipStream_t stream,
const in_type* indices,
const fast_divmod fdm_depth_suffix,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
const out_type off_value,
out_type* output,
size_t count);
template <typename in_type, typename out_type>
void OneHotWithZeroOffValueImpl(
hipStream_t stream,
const in_type* indices,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
out_type* output,
size_t count);
template <typename in_type, typename out_type, typename depth_type>
class OneHotOp final : public RocmKernel {
public:
explicit OneHotOp(const OpKernelInfo& info) : RocmKernel(info) {
int64_t tmp_axis;
if (info.GetAttr<int64_t>("axis", &tmp_axis).IsOK()) {
axis_ = tmp_axis;
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OneHotOp);
int64_t axis_ = -1;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pad.h"
#include "pad_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Pad, \
kOnnxDomain, \
2, 10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Pad<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Pad, \
kOnnxDomain, \
11, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Pad<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Pad, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Pad<T>);
using PadsVector = PadBase::PadsVector;
static bool IsNCHWInputWithPaddingAlongHAndW(size_t input_rank,
const TArray<int64_t>& lower_pads,
const TArray<int64_t>& upper_pads) {
if (input_rank == 2) { // N = 1 and C = 1
return true;
}
// Is CHW input AND no padding along C dim
if (input_rank == 3 &&
lower_pads[0] == 0 && // start padding along C
upper_pads[0] == 0) { // end padding along C
return true;
}
// Is NCHW input AND no padding along N and C dims
if (input_rank == 4 &&
lower_pads[0] == 0 && lower_pads[1] == 0 && // start padding along N and C
upper_pads[0] == 0 && upper_pads[1] == 0) { // end padding along N and C
return true;
}
return false;
}
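// When padding only touches the spatial (H and W) dimensions, Pad dispatches to
// PadNCHWInputWithPaddingAlongHAndWImpl instead of the generic per-dimension PadImpl.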
template <typename T>
typename ToHipType<T>::MappedType ToCudaValue(const T& value) {
return value;
}
template <>
typename ToHipType<MLFloat16>::MappedType ToCudaValue<MLFloat16>(const MLFloat16& value) {
return *reinterpret_cast<const typename ToHipType<MLFloat16>::MappedType*>(&value.val);
}
template <typename T>
Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<T>::MappedType HipT;
const auto& input_tensor = *ctx->Input<Tensor>(0);
auto const& input_shape = input_tensor.Shape();
int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
const PadsVector* p_pads = &pads_;
const PadsVector* p_slices = &slices_;
HipT value = ToHipType<T>::FromFloat(value_);
// kOnnxDomain Pad opset >= 11 or kMsDomain opset == 1
PadsVector pads;
PadsVector slices;
if (is_dynamic_) {
const Tensor& pads_tensor = *ctx->Input<Tensor>(1);
const auto pads_tensor_dims = pads_tensor.Shape().GetDims();
ORT_ENFORCE(utils::IsPrimitiveDataType<int64_t>(pads_tensor.DataType()),
"Pads tensor should be an INT64 tensor");
ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
"Pads tensor should be a 1D tensor of shape [2 * input_rank] or a 2D tensor of shape [1, 2 * input_rank]");
const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
ORT_ENFORCE(pads_size == 2 * static_cast<size_t>(dimension_count),
"Pads tensor size should be equal to twice the input dimension count ");
pads.reserve(2LL * dimension_count);
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
// Separate out any negative pads into the slices array
slices.resize(pads.size(), 0);
for (size_t index = 0; index < pads.size(); index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
T raw_value{};
const Tensor* value_tensor = ctx->Input<Tensor>(2);
if (nullptr != value_tensor) {
ORT_ENFORCE(utils::IsPrimitiveDataType<T>(value_tensor->DataType()) &&
value_tensor->Shape().Size() == 1,
"Value tensor should be a 1D tensor of size 1 with the same type as that of the input tensor");
raw_value = value_tensor->Data<T>()[0];
value = ToCudaValue<T>(raw_value);
}
p_pads = &pads;
p_slices = &slices;
}
TensorPitches input_pitches(input_shape.GetDims());
TArray<int64_t> input_dims(input_shape.GetDims());
TArray<int64_t> input_strides(input_pitches);
auto output_dims(input_shape.AsShapeVector());
ORT_ENFORCE(static_cast<size_t>(dimension_count * 2) == p_pads->size(), "'pads' attribute has wrong number of values");
// Calculate output dimensions, and handle any negative padding
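// Illustrative example (not from the original source): a 2-D input of shape [4, 5] with
// pads = [1, -2, 0, 3] is split into pads = [1, 0, 0, 3] and slices = [0, -2, 0, 0], giving
// lower_pads = [1, -2], upper_pads = [0, 3] and an output shape of [5, 6].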
TArray<int64_t> lower_pads(dimension_count);
TArray<int64_t> upper_pads(dimension_count);
for (auto i = 0; i < dimension_count; i++) {
lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
upper_pads[i] = (*p_pads)[i + dimension_count] + (*p_slices)[i + dimension_count];
output_dims[i] += lower_pads[i] + upper_pads[i];
}
TensorShape output_shape(output_dims);
// Special case when there is a dim value of 0 in the shape; behavior depends on the mode.
if (input_shape.Size() == 0) {
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
}
auto& output_tensor = *ctx->Output(0, output_shape);
if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
output_shape.Size() > 0) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
output_tensor.MutableData<T>(), input_tensor.Data<T>(),
sizeof(typename ToHipType<T>::MappedType) * output_shape.Size(),
hipMemcpyDeviceToDevice, Stream()));
return Status::OK();
}
if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
// If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)
// NCHW input
int height_dim = 2;
int width_dim = 3;
if (dimension_count == 3) { // CHW input
height_dim = 1;
width_dim = 2;
} else if (dimension_count == 2) { // HW input
height_dim = 0;
width_dim = 1;
}
PadNCHWInputWithPaddingAlongHAndWImpl(
Stream(),
dimension_count == 4 ? input_dims[0] : 1,
dimension_count == 4 ? input_dims[1] : (dimension_count == 3 ? input_dims[0] : 1),
input_dims[height_dim],
output_dims[height_dim],
input_dims[width_dim],
output_dims[width_dim],
lower_pads[height_dim],
lower_pads[width_dim],
value,
static_cast<int>(mode_),
reinterpret_cast<const typename ToHipType<T>::MappedType*>(input_tensor.Data<T>()),
reinterpret_cast<typename ToHipType<T>::MappedType*>(output_tensor.MutableData<T>()),
output_tensor.Shape().Size());
return Status::OK();
}
TArray<fast_divmod> fdm_output_strides(dimension_count);
TensorPitches output_strides(output_dims);
for (auto i = 0; i < dimension_count; i++) {
fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
}
PadImpl(
Stream(),
dimension_count,
input_dims,
input_strides,
lower_pads,
value,
static_cast<int>(mode_),
reinterpret_cast<const typename ToHipType<T>::MappedType*>(input_tensor.Data<T>()),
fdm_output_strides,
reinterpret_cast<typename ToHipType<T>::MappedType*>(output_tensor.MutableData<T>()),
output_tensor.Shape().Size());
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
SPECIALIZED_COMPUTE(MLFloat16)
SPECIALIZED_COMPUTE(bool)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/padbase.h"
using onnxruntime::PadBase;
namespace onnxruntime {
namespace rocm {
template <typename T>
class Pad final : public PadBase, public RocmKernel {
public:
Pad(const OpKernelInfo& info) : PadBase(info), RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "pad_impl.h"
namespace onnxruntime {
namespace rocm {
// PadMode enum from core/providers/cpu/tensor/pad.h, cannot use that header because of nvcc/onnxruntime incompatibility
enum class PadMode : int {
Constant = 0,
Reflect,
Edge
};
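// _PadKernel maps one thread to one output element. The flat output index is peeled one
// dimension at a time with fdm_output_strides; an output coordinate that falls in the leading
// or trailing pad region is either replaced by pad_value (Constant), clamped to the nearest
// edge (Edge), or mirrored back into the input (Reflect).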
template <typename T, int pad_mode>
__global__ void _PadKernel(
const size_t shape_rank,
const TArray<int64_t> input_dims,
const TArray<int64_t> input_strides,
const TArray<int64_t> lower_pads,
const T pad_value,
const T* input_data,
const TArray<fast_divmod> fdm_output_strides,
T* output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
HIP_LONG output_index = id;
bool use_pad_value = false;
for (int dim = 0; dim < shape_rank && !use_pad_value; ++dim) {
int out_coord, r;
fdm_output_strides[dim].divmod(output_index, out_coord, r);
output_index = r;
int in_coord = 0;
if (out_coord < lower_pads[dim]) {
switch ((PadMode)pad_mode) {
case PadMode::Constant:
use_pad_value = true;
break;
case PadMode::Edge:
in_coord = 0;
break;
case PadMode::Reflect:
in_coord = lower_pads[dim] - out_coord;
break;
}
} else if (out_coord >= lower_pads[dim] + input_dims[dim]) {
switch ((PadMode)pad_mode) {
case PadMode::Constant:
use_pad_value = true;
break;
case PadMode::Edge:
in_coord = input_dims[dim] - 1;
break;
case PadMode::Reflect:
in_coord = input_dims[dim] - 2 - (out_coord - (lower_pads[dim] + input_dims[dim]));
break;
}
} else {
in_coord = out_coord - lower_pads[dim];
}
input_index += input_strides[dim] * in_coord;
}
output_data[id] = use_pad_value ? (T)pad_value : input_data[input_index];
}
template <typename T, int pad_mode>
__global__ void _PadNCHWInputWithPaddingAlongHAndWKernel(
const int64_t n, // Batch
const int64_t c, // Channel
const int64_t input_height,
const int64_t output_height,
const int64_t input_width,
const int64_t output_width,
const int64_t pad_height_start,
const int64_t pad_width_start,
const T pad_value,
const T* input_data,
T* output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
const int current_output_width = id % output_width;
int nc_index = id / output_width;
const int current_output_height = nc_index % output_height;
nc_index /= output_height;
int current_input_height = current_output_height - pad_height_start;
int current_input_width = current_output_width - pad_width_start;
switch ((PadMode)pad_mode) {
case PadMode::Constant:
output_data[id] = (current_input_height < 0 ||
current_input_width < 0 ||
current_input_height >= input_height ||
current_input_width >= input_width)
? pad_value
: input_data[(nc_index * input_height +
current_input_height) *
input_width +
current_input_width];
break;
case PadMode::Edge:
current_input_height = std::max(0, std::min(current_input_height, static_cast<int>(input_height - 1)));
current_input_width = std::max(0, std::min(current_input_width, static_cast<int>(input_width - 1)));
output_data[id] = input_data[(nc_index * input_height +
current_input_height) *
input_width +
current_input_width];
break;
case PadMode::Reflect:
current_input_height = std::max(current_input_height, -current_input_height);
current_input_height = std::min(static_cast<int>(current_input_height),
2 * static_cast<int>(input_height) - current_input_height - 2);
current_input_width = std::max(current_input_width, -current_input_width);
current_input_width = std::min(static_cast<int>(current_input_width),
2 * static_cast<int>(input_width) - current_input_width - 2);
output_data[id] = input_data[(nc_index * input_height +
current_input_height) *
input_width +
current_input_width];
break;
}
}
template <typename T>
void PadImpl(
hipStream_t stream,
const size_t shape_rank,
const TArray<int64_t>& input_dims,
const TArray<int64_t>& input_strides,
const TArray<int64_t>& lower_pads,
const T pad_value,
const int pad_mode,
const T* input_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
const size_t N) {
if (N == 0) // special case where there's a dim value of 0 in the output shape
return;
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (pad_mode) {
case 0:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 0>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_dims, input_strides, lower_pads,
pad_value, input_data, fdm_output_strides, output_data, N);
break;
case 1:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_dims, input_strides, lower_pads,
pad_value, input_data, fdm_output_strides, output_data, N);
break;
case 2:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_dims, input_strides, lower_pads,
pad_value, input_data, fdm_output_strides, output_data, N);
break;
}
}
template <typename T>
void PadNCHWInputWithPaddingAlongHAndWImpl(
hipStream_t stream,
const int64_t n, // Batch
const int64_t c, // Channel
const int64_t input_height,
const int64_t output_height,
const int64_t input_width,
const int64_t output_width,
const int64_t pad_height_start,
const int64_t pad_width_start,
const T pad_value,
const int pad_mode,
const T* input_data,
T* output_data,
const size_t N) {
if (N == 0) // special case where there's a dim value of 0 in the output shape
return;
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (pad_mode) {
case 0:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 0>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
n, c, input_height, output_height, input_width, output_width,
pad_height_start, pad_width_start,
pad_value, input_data, output_data, N);
break;
case 1:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
n, c, input_height, output_height, input_width, output_width,
pad_height_start, pad_width_start,
pad_value, input_data, output_data, N);
break;
case 2:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
n, c, input_height, output_height, input_width, output_width,
pad_height_start, pad_width_start,
pad_value, input_data, output_data, N);
break;
}
}
#define SPECIALIZED_IMPL(T) \
template void PadImpl<T>(hipStream_t stream, const size_t shape_rank, \
const TArray<int64_t>& input_dims, const TArray<int64_t>& input_strides, \
const TArray<int64_t>& lower_pads, \
const T pad_value, \
const int pad_mode, \
const T* input_data, \
const TArray<fast_divmod>& fdm_output_strides, \
T* output_data, \
const size_t N); \
template void PadNCHWInputWithPaddingAlongHAndWImpl<T>(hipStream_t stream, const int64_t n, const int64_t c, \
const int64_t input_height, const int64_t output_height, \
const int64_t input_width, const int64_t output_width, \
const int64_t pad_height_start, \
const int64_t pad_width_start, \
const T pad_value, \
const int pad_mode, \
const T* input_data, T* output_data, \
const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(bool)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void PadNCHWInputWithPaddingAlongHAndWImpl(
hipStream_t stream,
const int64_t n, // Batch
const int64_t c, // Channel
const int64_t input_height,
const int64_t output_height,
const int64_t input_width,
const int64_t output_width,
const int64_t pad_height_start,
const int64_t pad_width_start,
const T pad_value,
const int pad_mode,
const T* input_data,
T* output_data,
const size_t N);
template <typename T>
void PadImpl(
hipStream_t stream,
const size_t shape_rank,
const TArray<int64_t>& input_dims,
const TArray<int64_t>& input_strides,
const TArray<int64_t>& lower_pads,
const T pad_value,
const int pad_mode,
const T* input_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.h"
#include "quantize_linear.cuh"
namespace onnxruntime {
namespace rocm {
template <class T, class U>
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<U>::MappedType CudaU;
auto& x = *ctx->Input<Tensor>(0);
auto& y_scale = *ctx->Input<Tensor>(1);
auto* y_zero_point = ctx->Input<Tensor>(2);
auto& y = *ctx->Output(0, x.Shape());
const auto& x_shape = x.Shape();
const CudaU* input = reinterpret_cast<const CudaU*>(x.Data<U>());
T* output = y.MutableData<T>();
// TODO: support per-channel
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements));
return Status::OK();
}
template <class T, class U>
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<U>::MappedType CudaU;
auto& x = *ctx->Input<Tensor>(0);
auto& y_scale = *ctx->Input<Tensor>(1);
auto* y_zero_point = ctx->Input<Tensor>(2);
const auto& x_shape = x.Shape();
auto& y = *ctx->Output(0, x_shape);
const T* input = x.Data<T>();
CudaU* output = reinterpret_cast<CudaU*>(y.MutableData<U>());
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements));
return Status::OK();
}
// register QuantizeLinear kernels
#define REGISTER_Q_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, float>);
REGISTER_Q_KERNEL_TYPED(int8_t)
REGISTER_Q_KERNEL_TYPED(uint8_t)
// register DequantizeLinear kernels
#define REGISTER_DQ_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
DequantizeLinear<T, float>);
REGISTER_DQ_KERNEL_TYPED(int8_t)
REGISTER_DQ_KERNEL_TYPED(uint8_t)
// specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal
#define SPECIALIZED_QDQ_COMPUTE(T, U) \
template Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const; \
template Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_QDQ_COMPUTE(int8_t, float)
SPECIALIZED_QDQ_COMPUTE(uint8_t, float)
SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16)
SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16)
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.cuh"
#include <limits>
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
template <typename T>
struct Round;
template <>
struct Round<float> {
__device__ __forceinline__ int operator()(float v) const {
return __float2int_rn(v);
}
};
template <>
struct Round<half> {
__device__ __forceinline__ int operator()(half v) const {
return __half2int_rn(v);
}
};
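// QuantizeLinearKernel computes y = saturate(round(x / scale) + zero_point) with
// round-to-nearest-even, clamping to the numeric range of OutT. Each thread handles up to
// NumElementsPerThread elements, strided by NumThreadsPerBlock within its block.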
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, HIP_LONG N, Round<InT> round) {
HIP_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
InT scale = *scale_ptr;
OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = round(input[id] / scale) + zero_point;
output[id] = static_cast<OutT>(max(std::numeric_limits<OutT>::min(), min(std::numeric_limits<OutT>::max(), value)));
id += NumThreadsPerBlock;
}
}
}
template <class OutT, class InT>
Status CudaQuantizeLinear(hipStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
hipLaunchKernelGGL(HIP_KERNEL_NAME(QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element),
Round<InT>());
return Status::OK();
}
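// DequantizeLinearKernel computes y = (x - zero_point) * scale using the same per-thread
// element striding as the quantize kernel.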
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, HIP_LONG N) {
HIP_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
OutT scale = *scale_ptr;
InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
id += NumThreadsPerBlock;
}
}
}
template <class InT, class OutT>
Status CudaDequantizeLinear(hipStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element));
return Status::OK();
}
template Status CudaQuantizeLinear<int8_t, float>(hipStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t, float>(hipStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<int8_t, half>(hipStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t, half>(hipStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<int8_t, float>(hipStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t, float>(hipStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<int8_t, half>(hipStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t, half>(hipStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
} // namespace rocm
} // namespace onnxruntime