Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/common/gsl.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/transpose.h"
namespace onnxruntime {
namespace rocm {
class Transpose final : public RocmKernel, public TransposeBase {
public:
Transpose(const OpKernelInfo& info) : RocmKernel(info), TransposeBase(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
static Status DoTranspose(const Transpose& transpose_kernel,
const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output);
// `input_shape_override` (if provided) overrides the shape of `input` for compute purposes
// `output_shape_override` (if provided) overrides the shape of `output` for compute purposes
static Status DoTranspose(const hipDeviceProp_t& prop,
hipStream_t stream,
const rocblas_handle rocblas_handle,
const gsl::span<const size_t>& permutations,
const Tensor& input, Tensor& output,
const TensorShape* input_shape_override = nullptr,
const TensorShape* output_shape_override = nullptr);
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "transpose_impl.h"
namespace onnxruntime {
namespace rocm {
constexpr unsigned int kNumElementsPerThread = 4;
constexpr unsigned int kTileSize = 32;
// TileSize is always 32 in the current implementation, but it is kept as a template parameter for future flexibility.
// For each batch, transpose matrix [m, n] to [n, m].
template <typename T, unsigned int TileSize>
__global__ void Transpose3DKernel(const int64_t m, const int64_t n, const int64_t batch_stride, const T* input_data,
T* output_data) {
__shared__ T tile[TileSize][TileSize + 1];
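// Note: the extra column (+1) staggers rows across shared-memory banks so that the transposed
// reads in the write phase below (first index varying across threads) do not all hit the same bank.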
int x = blockIdx.x * TileSize + threadIdx.x;
int y = blockIdx.y * TileSize + threadIdx.y;
if (x < n) {
#pragma unroll
for (unsigned int i = 0; i < TileSize; i += (TileSize / kNumElementsPerThread)) {
int y_idx = y + i;
if (y_idx < m) {
tile[threadIdx.y + i][threadIdx.x] = input_data[blockIdx.z * batch_stride + y_idx * n + x];
}
}
}
__syncthreads();
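// Write phase: swap the block indices so that consecutive threads write consecutive output
// addresses (coalesced), reading the staged tile transposed.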
x = blockIdx.y * TileSize + threadIdx.x;
y = blockIdx.x * TileSize + threadIdx.y;
if (x < m) {
#pragma unroll
for (unsigned int i = 0; i < TileSize; i += (TileSize / kNumElementsPerThread)) {
int y_idx = y + i;
if (y_idx < n) {
output_data[blockIdx.z * batch_stride + y_idx * m + x] = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
bool CanDoTranspose3D(const hipDeviceProp_t& prop, size_t rank, const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations, dim3& grid_size, dim3& block_size) {
// Permutation is done in the last two dimensions.
if (rank == 3 && permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2)) {
// Normally maxGridSize.x is a large number but maxGridSize.y and maxGridSize.z are limited. Ideally we can check
// the input sizes to see if a dimension is too large so that we can use grid.x for it to avoid returning false.
// But this requires different versions of kernel implementation with different index compute logics.
// Below code is good enough for most of the cases for now, and if we see any case that input_dims[0] or
// input_dims[1] is too large in the future, we will handle it accordingly.
int grid_size_x = CeilDiv(static_cast<int>(input_dims[2]), kTileSize);
int grid_size_y = CeilDiv(static_cast<int>(input_dims[1]), kTileSize);
int grid_size_z = static_cast<int>(input_dims[0]);
if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] &&
grid_size_z <= prop.maxGridSize[2]) {
block_size = dim3(kTileSize, kTileSize / kNumElementsPerThread);
grid_size = dim3(static_cast<unsigned int>(grid_size_x), static_cast<unsigned int>(grid_size_y),
static_cast<unsigned int>(grid_size_z));
return true;
} else {
return false;
}
}
return false;
}
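// A worked example of the launch configuration above (hypothetical shape, using the default
// kTileSize = 32 and kNumElementsPerThread = 4): for input_dims = [8, 100, 65],
//   grid_size  = (CeilDiv(65, 32), CeilDiv(100, 32), 8) = (3, 4, 8)
//   block_size = (kTileSize, kTileSize / kNumElementsPerThread) = (32, 8)
// so each block transposes one 32x32 tile of one batch, and every thread moves 4 elements.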
#define HANDLE_TRANSPOSE_3D_TILE_DIM(type) \
case sizeof(type): { \
Transpose3DKernel<type, kTileSize> \
<<<grid_size, block_size, 0, stream>>>(input_shape[1], input_shape[2], input_strides[0], \
reinterpret_cast<const ToHipType<type>::MappedType*>(input_data), \
reinterpret_cast<ToHipType<type>::MappedType*>(output_data)); \
} break
Status Transpose3DImpl(hipStream_t stream, size_t element_size, const TArray<int64_t>& input_shape,
const TArray<int64_t>& input_strides, const void* input_data, void* output_data, int64_t N,
const dim3& grid_size, const dim3& block_size) {
switch (element_size) {
HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t);
HANDLE_TRANSPOSE_3D_TILE_DIM(int16_t);
HANDLE_TRANSPOSE_3D_TILE_DIM(int32_t);
HANDLE_TRANSPOSE_3D_TILE_DIM(int64_t);
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
template <int element_size>
__global__ void Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim(
const TArray<int64_t> input_strides, const void* input_data,
const TArray<int64_t> output_strides, void* output_data,
int64_t input_shape_2, HIP_LONG N) {
// coordinates will be: [d0, d1, d2, d3]
HIP_LONG d0 = blockIdx.z;
HIP_LONG d1 = blockIdx.y;
HIP_LONG d2 = threadIdx.y + blockIdx.x * blockDim.y;
HIP_LONG d3 = threadIdx.x;
HIP_LONG input_index = (d0 * input_strides[0] +
d1 * input_strides[1] +
d2 * input_strides[2]) /
(4 * sizeof(int) / element_size) +
d3 * input_strides[3];
HIP_LONG output_index = (d0 * output_strides[0] +
d1 * output_strides[1] +
d2 * output_strides[2]) /
(4 * sizeof(int) / element_size) +
d3 * output_strides[3];
const int4* v_input = reinterpret_cast<const int4*>(input_data);
int4* v_output = reinterpret_cast<int4*>(output_data);
if (input_index < N && output_index < N && d2 < input_shape_2) {
v_output[output_index] = v_input[input_index];
}
}
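// Note on the divisions in the kernel above: each thread moves one 16-byte int4, so the strides of
// dims 0..2 (given in element units) are converted to int4 units by dividing by
// 4 * sizeof(int) / element_size. The CanDo... helper below only enables this kernel when
// permutations[3] == 3, so both innermost strides are 1 and d3 already indexes int4 chunks directly.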
bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size) {
if (rank == 4 &&
// the permutation does not move the last (innermost) dimension.
permutations[3] == 3) {
unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast<unsigned int>(element_size); // int4 is used in the kernel to access data.
// dims[3]: block.x
// dims[2]: block.y + grid.x
// dims[1]: grid.y
// dims[0]: grid.z
if (input_dims[3] / num_elements_per_thread <= prop.maxThreadsPerBlock &&
(input_dims[3] % num_elements_per_thread) == 0 &&
input_dims[1] <= prop.maxGridSize[1] &&
input_dims[0] <= prop.maxGridSize[2]) {
// There are 2 constraints when launching the kernels
// 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
// 2. block_size_y * num_block_ext >= input_dims[2]
int64_t block_size_x = input_dims[3] / num_elements_per_thread;
int64_t max_block_size_y = prop.maxThreadsPerBlock / block_size_x;
int64_t block_size_y = std::min(input_dims[2], max_block_size_y);
int64_t num_block_ext = CeilDiv(input_dims[2], block_size_y);
if (num_block_ext <= prop.maxGridSize[0]) {
block_size = dim3(static_cast<unsigned int>(block_size_x), static_cast<unsigned int>(block_size_y));
grid_size = dim3(static_cast<unsigned int>(num_block_ext),
static_cast<unsigned int>(input_dims[1]),
static_cast<unsigned int>(input_dims[0]));
return true;
} else {
return false;
}
}
}
return false;
}
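// A worked example of the sizing logic above (hypothetical shape, assuming element_size = 2 (fp16)
// and prop.maxThreadsPerBlock = 1024): num_elements_per_thread = 4 * sizeof(int) / 2 = 8.
// For input_dims = [2, 3, 640, 1024]:
//   block_size_x  = 1024 / 8 = 128
//   block_size_y  = min(640, 1024 / 128) = 8
//   num_block_ext = CeilDiv(640, 8) = 80
//   block_size = (128, 8), grid_size = (80, 3, 2)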
Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(
hipStream_t stream, size_t element_size,
const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<int64_t>& output_strides,
void* output_data, int N, const dim3& grid_size, const dim3& block_size) {
unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast<unsigned int>(element_size); // int4 is used in the kernel to access data.
switch (element_size) {
case sizeof(int8_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int8_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int16_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int32_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int64_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
default:
// User will not hit this as this kernel is for fixed element size tensors only
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
__global__ void Transpose4DKernelParallelizeOneElementPerThread(
const TArray<int64_t> input_strides, const int8_t* input_data,
const TArray<int64_t> output_strides, int8_t* output_data,
size_t element_size, int64_t input_shape_2, HIP_LONG N) {
// coordinates will be: [d0, d1, d2, d3]
HIP_LONG d0 = blockIdx.z;
HIP_LONG d1 = blockIdx.y;
HIP_LONG d2 = threadIdx.y + blockIdx.x * blockDim.y;
HIP_LONG d3 = threadIdx.x;
HIP_LONG input_index = d0 * input_strides[0] +
d1 * input_strides[1] +
d2 * input_strides[2] +
d3 * input_strides[3];
HIP_LONG output_index = d0 * output_strides[0] +
d1 * output_strides[1] +
d2 * output_strides[2] +
d3 * output_strides[3];
if (input_index < N && output_index < N && d2 < input_shape_2) {
const int8_t* input_data_to_be_copied = input_data + (input_index * element_size);
int8_t* output_data_to_be_copied = output_data + (output_index * element_size);
// copy over the bytes
for (size_t iter = 0; iter < element_size; ++iter) {
*output_data_to_be_copied++ = *input_data_to_be_copied++;
}
}
}
bool CanDoTranspose4DParallelizeOneElementPerThread(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size) {
if (rank == 4) {
// dims[3]: block.x
// dims[2]: block.y + grid.x
// dims[1]: grid.y
// dims[0]: grid.z
if (input_dims[3] <= prop.maxThreadsPerBlock &&
input_dims[1] <= prop.maxGridSize[1] &&
input_dims[0] <= prop.maxGridSize[2]) {
// There are 2 constraints when launching the kernels
// 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
// 2. block_size_y * num_block_ext >= input_dims[2]
int64_t block_size_x = input_dims[3];
int64_t max_block_size_y = prop.maxThreadsPerBlock / block_size_x;
int64_t block_size_y = std::min(input_dims[2], max_block_size_y);
int64_t num_block_ext = CeilDiv(input_dims[2], block_size_y);
if (num_block_ext <= prop.maxGridSize[0]) {
block_size = dim3(static_cast<unsigned int>(block_size_x), static_cast<unsigned int>(block_size_y));
grid_size = dim3(static_cast<unsigned int>(num_block_ext),
static_cast<unsigned int>(input_dims[1]),
static_cast<unsigned int>(input_dims[0]));
return true;
} else {
return false;
}
}
}
return false;
}
Status Transpose4DParallelizeOneElementPerThread(
hipStream_t stream, size_t element_size,
const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<int64_t>& output_strides,
void* output_data, int N, const dim3& grid_size, const dim3& block_size) {
if (element_size != sizeof(int8_t) &&
element_size != sizeof(int16_t) &&
element_size != sizeof(int32_t) &&
element_size != sizeof(int64_t)) {
// User will not hit this as this kernel is for fixed element size tensors only
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
hipLaunchKernelGGL(Transpose4DKernelParallelizeOneElementPerThread, grid_size, block_size, 0, stream,
input_strides, reinterpret_cast<const int8_t*>(input_data),
output_strides, reinterpret_cast<int8_t*>(output_data),
element_size, input_shape[2], N);
return Status::OK();
}
template <typename T>
__global__ void TransposeKernel(int32_t shape_rank, const TArray<int64_t> input_strides,
const T* input_data, const TArray<fast_divmod> output_strides, T* output_data, HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
HIP_LONG output_index = id;
#pragma unroll
for (auto dim = 0; dim < input_strides.Capacity(); ++dim) {
if (dim >= shape_rank) {
break;
}
int out_coord, r;
output_strides[dim].divmod(output_index, out_coord, r);
output_index = r;
input_index += input_strides[dim] * out_coord;
}
output_data[id] = input_data[input_index];
}
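// A worked example of the index math above (a sketch; it assumes the caller passes input strides
// already permuted into output order): transposing a [2, 3] input with permutation [1, 0] gives
// output shape [3, 2], fdm_output_strides = {2, 1} and input_strides = {1, 3}.
// For output id = 4 (output coordinate [2, 0]):
//   divmod(4, 2) -> out_coord = 2, r = 0 -> input_index += 1 * 2 = 2
//   divmod(0, 1) -> out_coord = 0        -> input_index += 3 * 0
// so output[2][0] is read from input linear index 2, i.e. input[0][2], as expected.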
Status TransposeImpl(hipStream_t stream, size_t element_size, int32_t shape_rank, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<fast_divmod>& fdm_output_strides, void* output_data, int N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (element_size) {
case sizeof(int8_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int8_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
N);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int16_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
N);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int32_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
N);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int64_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
N);
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
bool CanDoTranspose3D(const hipDeviceProp_t& prop,
size_t rank, const gsl::span<const int64_t>& input_dims, const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size);
Status Transpose3DImpl(hipStream_t stream, size_t element_size, const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides, const void* input_data,
void* output_data, int64_t N,
const dim3& grid_size, const dim3& block_size);
bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size);
Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(hipStream_t stream,
size_t element_size, const TArray<int64_t>& input_shape,
const TArray<int64_t>& input_strides, const void* input_data,
const TArray<int64_t>& output_strides, void* output_data, int N,
const dim3& grid_size, const dim3& block_size);
bool CanDoTranspose4DParallelizeOneElementPerThread(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size);
Status Transpose4DParallelizeOneElementPerThread(hipStream_t stream,
size_t element_size, const TArray<int64_t>& input_shape,
const TArray<int64_t>& input_strides, const void* input_data,
const TArray<int64_t>& output_strides, void* output_data, int N,
const dim3& grid_size, const dim3& block_size);
Status TransposeImpl(hipStream_t stream, size_t element_size, int32_t shape_rank, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<fast_divmod>& fdm_output_strides, void* output_data, int N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/trilu.h"
#include "core/providers/rocm/tensor/trilu_impl.h"
#include "core/providers/cpu/tensor/utils.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_KERNEL_EX(
Trilu,
kOnnxDomain,
14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.MayInplace(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Trilu);
Status Trilu::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input_ptr = ctx->Input<Tensor>(0);
const auto* k = ctx->Input<Tensor>(1);
int64_t k_val = 0;
if (k) {
ORT_ENFORCE(IsScalarOr1ElementVector(k), "k should be a 1-D or 0-D tensor.");
k_val = *(k->Data<int64_t>());
}
if (input_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const Tensor& input = *input_ptr;
const auto& shape = input.Shape();
const auto& input_dims = shape.GetDims();
int32_t rank = gsl::narrow_cast<int32_t>(input_dims.size());
if (rank < 2) {
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Input tensor should have a rank of at least 2");
}
Tensor* output = ctx->Output(0, shape);
int64_t matrix_size = input_dims[rank - 1] * input_dims[rank - 2];
if (matrix_size == 0) {
return Status::OK();
}
const fast_divmod row_col_divmod_indices(gsl::narrow_cast<int>(input_dims[rank - 1]));
const fast_divmod batch_divmod_indices(gsl::narrow_cast<int>(matrix_size));
size_t element_size = input.DataType()->Size();
return TriluImpl(
this->Stream(),
upper_,
element_size,
k_val,
input.DataRaw(),
output->MutableDataRaw(),
gsl::narrow<int>(shape.Size()),
batch_divmod_indices,
row_col_divmod_indices);
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class Trilu final : public RocmKernel {
public:
Trilu(const OpKernelInfo& info) : RocmKernel(info), upper_(info.GetAttrOrDefault<int64_t>("upper", 1) >= 1) {
}
~Trilu() = default;
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool upper_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "trilu_impl.h"
#include <stdio.h>
namespace onnxruntime {
namespace rocm {
template <typename T, bool upper>
__global__ void TriluKernel(
int64_t k,
const T* input_data,
T* output_data,
const HIP_LONG N,
const fast_divmod batch_divmod_indices,
const fast_divmod row_col_divmod_indices) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int row, col;
row_col_divmod_indices.divmod(batch_divmod_indices.mod(id), row, col);
output_data[id] = upper ? (((row + k) <= col) ? input_data[id] : 0) : (((row + k) >= col) ? input_data[id] : 0);
}
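// Example of the predicate above: for a square matrix with k = 0 and upper == true, entries with
// row <= col (the upper triangle including the main diagonal) are kept and the rest are zeroed;
// with k = 1 the main diagonal is zeroed as well, and upper == false keeps row + k >= col instead.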
Status TriluImpl(
hipStream_t stream,
bool upper,
size_t element_size,
int64_t k,
const void* input_data,
void* output_data,
int N,
const fast_divmod& batch_divmod_indices,
const fast_divmod& row_col_divmod_indices) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (element_size) {
case sizeof(int8_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int8_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int8_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
case sizeof(int16_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int16_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int16_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
case sizeof(int32_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int32_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int32_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
case sizeof(int64_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int64_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int64_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for trilu on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
Status TriluImpl(
hipStream_t stream,
bool upper,
size_t element_size,
int64_t k,
const void* input_data,
void* output_data,
int N,
const fast_divmod& batch_divmod_indices,
const fast_divmod& row_col_divmod_indices);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/unsqueeze.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
1, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Unsqueeze);
// explicitly support negative axis
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Unsqueeze);
// axes is input instead of attribute, support bfloat16
ONNX_OPERATOR_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.InputMemoryType(OrtMemTypeCPUInput, 1),
Unsqueeze);
Status Unsqueeze::ComputeInternal(OpKernelContext* ctx) const {
Prepare p;
ORT_RETURN_IF_ERROR(PrepareCompute(ctx, p));
const void* input = p.input_tensor->DataRaw();
void* output = p.output_tensor->MutableDataRaw();
if (input == output)
return Status::OK();
auto count = p.input_tensor->Shape().Size();
auto element_bytes = p.input_tensor->DataType()->Size();
HIP_RETURN_IF_ERROR(hipMemcpyAsync(output, input, count * element_bytes, hipMemcpyDeviceToDevice, Stream()));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/unsqueeze.h"
namespace onnxruntime {
namespace rocm {
class Unsqueeze final : public UnsqueezeBase, public RocmKernel {
public:
Unsqueeze(const OpKernelInfo& info) : UnsqueezeBase(info), RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "upsample.h"
#include "upsample_impl.h"
#include "core/providers/rocm/tensor/resize_impl.h"
#include "core/providers/cpu/tensor/utils.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
#define REGISTER_VERSIONED_TYPED_KERNEL(T, start, end) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Upsample, \
kOnnxDomain, \
start, \
end, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Upsample<T>)
REGISTER_VERSIONED_TYPED_KERNEL(float, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(double, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 7, 8);
// Upsample was deprecated in opset 10
REGISTER_VERSIONED_TYPED_KERNEL(float, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(double, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9);
template <typename T>
Status Upsample<T>::BaseCompute(OpKernelContext* context,
const std::vector<float>& roi,
const std::vector<float>& scales,
const gsl::span<const int64_t>& output_dims) const {
const Tensor* X = context->Input<Tensor>(0);
auto X_dims = X->Shape().GetDims();
int32_t rank = static_cast<int32_t>(X_dims.size());
ORT_ENFORCE(static_cast<int32_t>(output_dims.size()) == rank, "Rank of input and output tensor should be same.");
if (rank == 0)
return Status(ONNXRUNTIME, INVALID_ARGUMENT,
is_resize_ ? "Resize: input tensor cannot be scalar." : "Upsample: input tensor cannot be scalar.");
if (rank != static_cast<int32_t>(scales.size()))
return Status(ONNXRUNTIME, INVALID_ARGUMENT,
is_resize_ ? "Resize: input tensor's dimension does not match the scales." : "Upsample: input tensor's dimension does not match the scales.");
if (roi.size() != 2 * X->Shape().GetDims().size())
return Status(ONNXRUNTIME, INVALID_ARGUMENT,
"Resize: size of roi array should be 2 * N where N is the rank of input tensor X.");
Tensor* Y = context->Output(0, output_dims);
// Return early if the output tensor is going to be of size 0
if (Y->Shape().Size() == 0) {
return Status::OK();
}
typedef typename ToHipType<T>::MappedType HipT;
// kernel
TensorPitches input_pitches(X_dims);
TArray<int64_t> input_strides(input_pitches);
TensorPitches output_pitches(output_dims);
TArray<fast_divmod> output_div_pitches(rank);
for (int32_t i = 0; i < rank; ++i) {
output_div_pitches[i] = fast_divmod(gsl::narrow_cast<int>(output_pitches[i]));
}
size_t output_count = Y->Shape().Size();
if (is_resize_) {
TArray<int64_t> input_shape(X_dims);
TArray<int64_t> output_shape(output_dims);
TArray<float, 10> roi_vals(roi);
TArray<float> scales_vals(scales);
size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims);
auto dims_mapping_buffer = GetScratchBuffer<unsigned char>(temp_buffer_size);
void* dims_mapping = reinterpret_cast<void*>(dims_mapping_buffer.get());
ResizeImpl(Stream(), mode_, (int)rank, input_shape, output_shape,
input_strides, output_div_pitches, scales_vals, roi_vals,
reinterpret_cast<const HipT*>(X->Data<T>()),
reinterpret_cast<HipT*>(Y->MutableData<T>()),
output_count, use_extrapolation_, ToHipType<T>::FromFloat(extrapolation_value_),
cubic_coeff_a_, exclude_outside_,
coordinate_transform_mode_, nearest_mode_,
dims_mapping);
} else {
TArray<fast_divmod> scales_div(rank);
for (int32_t i = 0; i < rank; ++i) {
scales_div[i] = fast_divmod(gsl::narrow_cast<int>(ceil(scales[i])));
}
UpampleImpl(Stream(),
mode_,
rank,
(UpsampleMode::LINEAR == mode_) ? (rank == 2 ? X_dims[0] : X_dims[2]) : 0,
input_strides,
output_div_pitches,
scales_div,
reinterpret_cast<const HipT*>(X->Data<T>()),
reinterpret_cast<HipT*>(Y->MutableData<T>()),
output_count);
}
return Status::OK();
}
template <typename T>
Status Upsample<T>::ComputeInternal(OpKernelContext* context) const {
const Tensor* X = context->Input<Tensor>(0);
ORT_ENFORCE(X != nullptr);
TensorShapeVector output_dims(X->Shape().GetDims().size());
std::vector<float> roi_array(X->Shape().GetDims().size() * 2, 0.0f);
if (!roi_cached_) {
bool use_default_roi = true;
if (need_roi_input_) {
ORT_ENFORCE(roi_input_idx_ > 0, "Invalid roi input index.");
const auto* roi = context->Input<Tensor>(roi_input_idx_);
if (roi != nullptr) {
ParseRoiData(roi, roi_array);
use_default_roi = false;
}
}
if (use_default_roi) {
// the default roi ensures all the values in that axis are included in the roi
// normalized roi is thus : [start, end] = [0, 1]
const auto input_dims = X->Shape().GetDims();
size_t input_rank = input_dims.size();
roi_array.resize(input_rank * 2);
for (size_t i = 0; i < input_rank; ++i) {
roi_array[i] = 0;
roi_array[i + input_rank] = 1;
}
}
}
const std::vector<float>& roi = roi_cached_ ? roi_ : roi_array;
if (OpKernel::Node().InputDefs().size() == 1) {
// Compute output shape from scales and input dims
ComputeOutputShape(scales_, X->Shape().GetDims(), output_dims);
return BaseCompute(context, roi, scales_, output_dims);
}
const Tensor* scales = context->Input<Tensor>(scales_input_idx_);
const Tensor* sizes = context->Input<Tensor>(sizes_input_idx_);
if (scales_cached_) {
ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input.");
ComputeOutputShape(scales_, X->Shape().GetDims(), output_dims);
return BaseCompute(context, roi, scales_, output_dims);
}
std::vector<float> scales_array(X->Shape().GetDims().size());
if (scales != nullptr && scales->Shape().Size() != 0) {
// use scales input data
ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input.");
ParseScalesData(scales, scales_array);
ComputeOutputShape(scales_array, X->Shape().GetDims(), output_dims);
} else {
// When the sizes input is available, populate output_dims from it directly.
ORT_ENFORCE(sizes != nullptr && sizes->Shape().Size() != 0,
"Either scales or sizes MUST be provided as input.");
ORT_ENFORCE(sizes->Shape().Size() == static_cast<int64_t>(output_dims.size()),
"Resize: input tensor's rank does not match the output tensor's rank.");
memcpy(output_dims.data(), sizes->Data<int64_t>(), sizes->Shape().Size() * sizeof(int64_t));
ParseScalesDataFromOutputSize(output_dims, X->Shape().GetDims(), scales_array);
}
return BaseCompute(context, roi, scales_array, output_dims);
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Upsample : public UpsampleBase, public RocmKernel {
public:
Upsample(const OpKernelInfo& info) : UpsampleBase(info), RocmKernel(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
Status BaseCompute(OpKernelContext* context, const std::vector<float>& roi, const std::vector<float>& scales,
const gsl::span<const int64_t>& output_dims) const;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "upsample_impl.h"
namespace onnxruntime {
namespace rocm {
template <typename T, int RANK>
__global__ void _UpampleNearestKernel(const TArray<int64_t> input_pitches,
const TArray<fast_divmod> output_div_pitches,
const TArray<fast_divmod> scales_div,
const T* __restrict__ input_data,
T* __restrict__ output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
HIP_LONG output_index = id;
int div, mod;
for (int dim = 0; dim < RANK; ++dim) {
output_div_pitches[dim].divmod(output_index, div, mod);
output_index = mod;
if (scales_div[dim].d_ != 1 && div > 0) {
scales_div[dim].divmod(div, div, mod);
}
input_index += input_pitches[dim] * div;
}
output_data[id] = input_data[input_index];
}
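// Example of the nearest-neighbor index math above (1-D case, integer scale): for an input of
// length 3 upsampled by scale 2 (output length 6), output id = 5 yields divmod(5, 1) -> div = 5,
// then scales_div[0].divmod(5, ...) -> div = 2, so output[5] = input[2]; output ids 4 and 5 both
// map to input index 2.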
// The following method supports a 4-D input in 'Linear mode'
// that amounts to 'Bilinear' Upsampling/Resizing in the sense that it assumes
// the scale values for the outermost 2 dimensions are 1.
// This is the common use-case where the 4-D input (batched multi-channel images)
// is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
template <typename T>
__global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2,
const TArray<int64_t> input_pitches,
const TArray<fast_divmod> output_div_pitches,
const TArray<fast_divmod> scales_div,
const T* __restrict__ input_data,
T* __restrict__ output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
// For bilinear mode, scales[0]=scales[1]=1
int mod;
int index_of_dim0, index_of_dim1, index_of_dim2, index_of_dim3;
output_div_pitches[0].divmod(id, index_of_dim0, mod);
output_div_pitches[1].divmod(mod, index_of_dim1, mod);
output_div_pitches[2].divmod(mod, index_of_dim2, mod);
index_of_dim3 = mod;
int index_of_input_dim2, index_of_input_dim3, x_offset, y_offset;
scales_div[2].divmod(index_of_dim2, index_of_input_dim2, y_offset);
scales_div[3].divmod(index_of_dim3, index_of_input_dim3, x_offset);
input_index = index_of_dim0 * input_pitches[0] +
index_of_dim1 * input_pitches[1] +
index_of_input_dim2 * input_pitches[2] +
index_of_input_dim3;
T x00 = input_data[input_index];
T x10, x01, x11;
bool end_of_dim2 = false;
if (index_of_input_dim2 == (input_dim2 - 1)) {
// It's the end in dimension 2
x01 = x00;
end_of_dim2 = true;
} else {
x01 = input_data[input_index + input_pitches[2]];
}
if (index_of_input_dim3 == (input_pitches[2] - 1)) {
// It's the end in dimension 3
x10 = x00;
x11 = x01;
} else {
x10 = input_data[input_index + 1];
x11 = end_of_dim2 ? x10 : input_data[input_index + input_pitches[2] + 1];
}
T y_offset_T = static_cast<T>(y_offset);
T x_offset_T = static_cast<T>(x_offset);
T scales_div2_T = static_cast<T>(scales_div[2].d_);
T scales_div3_T = static_cast<T>(scales_div[3].d_);
T y0 = x00 + static_cast<T>(y_offset_T * (x01 - x00) / scales_div2_T);
T y1 = x10 + static_cast<T>(y_offset_T * (x11 - x10) / scales_div2_T);
output_data[id] = y0 + static_cast<T>(x_offset_T * (y1 - y0) / scales_div3_T);
}
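// In the bilinear blend above, y_offset / scales_div[2].d_ and x_offset / scales_div[3].d_ play
// the role of the usual fractional interpolation weights along dims 2 and 3 (the scales are
// treated as integral here), applied first along dim 2 and then along dim 3.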
// The following method supports a 2-D input in 'Linear mode'
template <typename T>
__global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0,
const TArray<int64_t> input_pitches,
const TArray<fast_divmod> output_div_pitches,
const TArray<fast_divmod> scales_div,
const T* __restrict__ input_data,
T* __restrict__ output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
int mod;
int index_of_dim0, index_of_dim1;
output_div_pitches[0].divmod(id, index_of_dim0, mod);
index_of_dim1 = mod;
int index_of_input_dim0, index_of_input_dim1, x_offset, y_offset;
scales_div[0].divmod(index_of_dim0, index_of_input_dim0, y_offset);
scales_div[1].divmod(index_of_dim1, index_of_input_dim1, x_offset);
input_index = index_of_input_dim0 * input_pitches[0] + index_of_input_dim1;
T x00 = input_data[input_index];
T x10, x01, x11;
bool end_of_dim0 = false;
if (index_of_input_dim0 == (input_dim0 - 1)) {
// It's the end in dimension 0
x01 = x00;
end_of_dim0 = true;
} else {
x01 = input_data[input_index + input_pitches[0]];
}
if (index_of_input_dim1 == (input_pitches[0] - 1)) {
// It's the end in dimension 1
x10 = x00;
x11 = x01;
} else {
x10 = input_data[input_index + 1];
x11 = end_of_dim0 ? x10 : input_data[input_index + input_pitches[0] + 1];
}
T y_offset_T = static_cast<T>(y_offset);
T x_offset_T = static_cast<T>(x_offset);
T scales_div0_T = static_cast<T>(scales_div[0].d_);
T scales_div1_T = static_cast<T>(scales_div[1].d_);
T y0 = x00 + static_cast<T>(y_offset_T * (x01 - x00) / scales_div0_T);
T y1 = x10 + static_cast<T>(y_offset_T * (x11 - x10) / scales_div0_T);
output_data[id] = y0 + static_cast<T>(x_offset_T * (y1 - y0) / scales_div1_T);
}
template <typename T>
void UpampleImpl(hipStream_t stream,
const onnxruntime::UpsampleMode upsample_mode,
const size_t rank,
const int64_t input_dim2,
const TArray<int64_t>& input_pitches,
const TArray<fast_divmod>& output_div_pitches,
const TArray<fast_divmod>& scales_div,
const T* input_data,
T* output_data,
const size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
if (onnxruntime::UpsampleMode::NN == upsample_mode) {
if (rank == 4) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 4>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 3) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 3>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 2) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 1) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else {
ORT_THROW("Unsupported rank by the Upsample ROCM kernel. Input rank: ", rank);
}
} else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode) {
if (rank == 4) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleBilinear4DInputKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_dim2, input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 2) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleBilinear2DInputKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_dim2, input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else {
ORT_THROW("Unsupported rank by the Upsample ROCM kernel. Input rank: ", rank);
}
} else {
// Should never encounter this as Upsample only supports 'Nearest' and 'Linear' modes.
// But if we do encounter this it is best to throw instead of returning silently.
ORT_THROW("Unsupported mode for Upsample: ", upsample_mode);
}
}
#define SPECIALIZED_IMPL(T) \
template void UpampleImpl<T>(hipStream_t stream, \
const onnxruntime::UpsampleMode upsample_mode, \
const size_t rank, \
const int64_t input_dim2, \
const TArray<int64_t>& input_pitches, \
const TArray<fast_divmod>& output_div_pitches, \
const TArray<fast_divmod>& scales_div, \
const T* input_data, \
T* output_data, \
const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint8_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void UpampleImpl(hipStream_t stream,
const onnxruntime::UpsampleMode upsample_mode,
const size_t rank,
const int64_t input_dim2,
const TArray<int64_t>& input_pitches,
const TArray<fast_divmod>& output_div_pitches,
const TArray<fast_divmod>& scales_div,
const T* input_data,
T* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "where.h"
#include "where_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
// kernel builder functions
#define WHERE_TYPED_KERNEL_WITH_TYPE_NAME(T, TName) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Where, \
kOnnxDomain, \
9, \
15, \
TName, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Where<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Where, \
kOnnxDomain, \
16, \
TName, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Where<T>);
// Compute the Where operator's output shape based upon three-way broadcasting.
Status ComputeOutputShape(const std::string& node_name, const TensorShape& cond_shape,
const TensorShape& x_shape, const TensorShape& y_shape, TensorShape& out_shape) {
size_t cond_rank = cond_shape.NumDimensions();
size_t x_rank = x_shape.NumDimensions();
size_t y_rank = y_shape.NumDimensions();
size_t out_rank = std::max(std::max(cond_rank, x_rank), y_rank);
std::vector<int64_t> output_dims(out_rank, 0);
for (size_t i = 0; i < out_rank; ++i) {
int64_t cond_dim = 1;
if (i < cond_rank)
cond_dim = cond_shape[cond_rank - 1 - i];
int64_t x_dim = 1;
if (i < x_rank)
x_dim = x_shape[x_rank - 1 - i];
int64_t y_dim = 1;
if (i < y_rank)
y_dim = y_shape[y_rank - 1 - i];
int64_t out_dim = std::max(std::max(cond_dim, x_dim), y_dim);
// special case to handle a dim of 0 which can be broadcast with a 1
if (out_dim == 1)
out_dim = std::min(std::min(cond_dim, x_dim), y_dim);
if (cond_dim != out_dim && cond_dim != 1)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_name, ": condition operand cannot broadcast on dim ", cond_rank - 1 - i,
" Condition Shape: ", cond_shape.ToString(), ", X Shape: ", x_shape.ToString(), ", Y Shape: ", y_shape.ToString());
if (x_dim != out_dim && x_dim != 1)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_name, ": X operand cannot broadcast on dim ", x_rank - 1 - i,
" Condition Shape: ", cond_shape.ToString(), ", X Shape: ", x_shape.ToString(), ", Y Shape: ", y_shape.ToString());
if (y_dim != out_dim && y_dim != 1)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_name, ": Y operand cannot broadcast on dim ", y_rank - 1 - i,
" Condition Shape: ", cond_shape.ToString(), ", X Shape: ", x_shape.ToString(), ", Y Shape: ", y_shape.ToString());
output_dims[out_rank - 1 - i] = out_dim;
}
out_shape = TensorShape(output_dims);
return Status::OK();
}
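// Worked example of the broadcasting rules above: cond shape [1, 3], X shape [2, 1], Y shape
// [2, 3] -> out_rank = 2; the last dim is max(3, 1, 3) = 3 and the first dim is max(1, 2, 2) = 2,
// so out_shape = [2, 3]; cond broadcasts along dim 0 and X along dim 1.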
struct TernaryElementwisePreparation {
const Tensor* a_tensor = nullptr;
const Tensor* b_tensor = nullptr;
const Tensor* c_tensor = nullptr;
size_t output_rank_or_simple_broadcast = 0; // for no_broadcast cases, output_rank uses SimpleBroadcast enums
TArray<int64_t> a_padded_strides; // left empty when a's shape == output shape
TArray<int64_t> b_padded_strides; // left empty when b's shape == output shape
TArray<int64_t> c_padded_strides; // left empty when c's shape == output shape
TArray<fast_divmod> fdm_output_strides;
BroadcastIndexType a_index_type = BroadcastIndexType::NoBroadcast;
BroadcastIndexType b_index_type = BroadcastIndexType::NoBroadcast;
BroadcastIndexType c_index_type = BroadcastIndexType::NoBroadcast;
TernaryElementwisePreparation(const Tensor* a, const Tensor* b, const Tensor* c)
: a_tensor(a), b_tensor(b), c_tensor(c) {}
Status TernaryElementwiseBroadcastPrepareHelper(const TensorShape& a_shape,
const TensorShape& b_shape,
const TensorShape& c_shape,
const TensorShape& output_shape) {
int32_t a_rank = static_cast<int32_t>(a_shape.NumDimensions());
int32_t b_rank = static_cast<int32_t>(b_shape.NumDimensions());
int32_t c_rank = static_cast<int32_t>(c_shape.NumDimensions());
int32_t out_rank = std::max(std::max(a_rank, b_rank), c_rank);
// early return when shapes match
if (a_shape == b_shape && b_shape == c_shape) {
output_rank_or_simple_broadcast = static_cast<size_t>(SimpleBroadcast::NoBroadcast);
return Status::OK();
}
output_rank_or_simple_broadcast = out_rank;
auto padder = [out_rank](int32_t rank, const TensorShape& shape, TArray<int64_t>& padded_strides) {
padded_strides.SetSize(out_rank);
if (rank > 0) {
TensorPitches pitches(shape.GetDims());
auto offset = out_rank - rank;
for (auto i = offset; i < out_rank; ++i) {
// the stride for broadcast dimension is kept as 0
if (shape.GetDims()[gsl::narrow_cast<size_t>(i) - offset] != 1) {
padded_strides[i] = pitches[gsl::narrow_cast<size_t>(i) - offset];
}
}
}
};
bool has_need_compute = false;
if (a_shape.Size() == 1) {
a_index_type = BroadcastIndexType::Scalar;
} else if (a_shape != output_shape) {
padder(a_rank, a_shape, a_padded_strides);
a_index_type = BroadcastIndexType::NeedCompute;
has_need_compute = true;
}
if (b_shape.Size() == 1) {
b_index_type = BroadcastIndexType::Scalar;
} else if (b_shape != output_shape) {
padder(b_rank, b_shape, b_padded_strides);
b_index_type = BroadcastIndexType::NeedCompute;
has_need_compute = true;
}
if (c_shape.Size() == 1) {
c_index_type = BroadcastIndexType::Scalar;
} else if (c_shape != output_shape) {
padder(c_rank, c_shape, c_padded_strides);
c_index_type = BroadcastIndexType::NeedCompute;
has_need_compute = true;
}
if (!has_need_compute) {
output_rank_or_simple_broadcast = static_cast<size_t>(SimpleBroadcast::NoBroadcast);
return Status::OK();
}
TensorPitches output_pitches(output_shape.GetDims());
fdm_output_strides.SetSize(out_rank);
for (auto i = 0; i < out_rank; ++i) {
fdm_output_strides[i] = fast_divmod(static_cast<int32_t>(output_pitches[i]));
}
return Status::OK();
}
};
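// Worked example of the padding helper above (hypothetical shapes): broadcasting a [3, 1] tensor
// against an output of shape [2, 3, 4] gives out_rank = 3 and padded strides {0, 1, 0}; the
// leading dim and the size-1 dim keep stride 0 (broadcast), while the size-3 dim keeps its pitch
// of 1, and the tensor's index type becomes BroadcastIndexType::NeedCompute.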
template <typename T>
Status Where<T>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const auto* const condition = context->Input<Tensor>(0);
const auto* const X = context->Input<Tensor>(1);
const auto* const Y = context->Input<Tensor>(2);
ORT_ENFORCE(condition && X && Y, "condition, X, and Y inputs are required!");
auto const& condition_shape = condition->Shape();
auto const& X_shape = X->Shape();
auto const& Y_shape = Y->Shape();
TensorShape output_shape;
ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), condition_shape, X_shape, Y_shape, output_shape));
auto output_tensor = context->Output(0, output_shape);
if (output_shape.Size() == 0)
return Status::OK();
TernaryElementwisePreparation prepare(condition, X, Y);
ORT_RETURN_IF_ERROR(prepare.TernaryElementwiseBroadcastPrepareHelper(condition_shape, X_shape, Y_shape, output_shape));
WhereImpl<HipT>(
Stream(),
prepare.output_rank_or_simple_broadcast,
prepare.a_index_type,
prepare.a_padded_strides,
reinterpret_cast<const bool*>(prepare.a_tensor->Data<bool>()),
prepare.b_index_type,
prepare.b_padded_strides,
reinterpret_cast<const HipT*>(prepare.b_tensor->Data<T>()),
prepare.c_index_type,
prepare.c_padded_strides,
reinterpret_cast<const HipT*>(prepare.c_tensor->Data<T>()),
prepare.fdm_output_strides,
reinterpret_cast<HipT*>(output_tensor->MutableData<T>()),
output_tensor->Shape().Size());
return Status::OK();
}
#define SPECIALIZED_COMPUTE_WITH_NAME(T, TName) \
WHERE_TYPED_KERNEL_WITH_TYPE_NAME(T, TName) \
template Status Where<T>::ComputeInternal(OpKernelContext* context) const;
#define SPECIALIZED_COMPUTE(T) \
SPECIALIZED_COMPUTE_WITH_NAME(T, T)
SPECIALIZED_COMPUTE(uint8_t)
SPECIALIZED_COMPUTE(int32_t)
SPECIALIZED_COMPUTE(int64_t)
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double_t)
SPECIALIZED_COMPUTE(MLFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Where final : public RocmKernel {
public:
Where(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef __GNUC__
#include "onnxruntime_config.h"
#pragma GCC diagnostic ignored "-Wswitch"
#endif
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "where_impl.h"
namespace onnxruntime {
namespace rocm {
// broadcast by computing output coordinate from offset, using fast_divmod
template <typename T, BroadcastIndexType CondIndexType, BroadcastIndexType XIndexType, BroadcastIndexType YIndexType, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _TenaryElementWise(
size_t output_rank,
const TArray<int64_t> cond_padded_strides,
const bool* cond_data,
const TArray<int64_t> x_padded_strides,
const T* x_data,
const TArray<int64_t> y_padded_strides,
const T* y_data,
const TArray<fast_divmod> fdm_output_strides,
T* output_data,
HIP_LONG N) {
HIP_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
bool cond_value[NumElementsPerThread];
T x_value[NumElementsPerThread];
T y_value[NumElementsPerThread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
// compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md
HIP_LONG cond_index = (CondIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
HIP_LONG x_index = (XIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
HIP_LONG y_index = (YIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
HIP_LONG offset = id;
#pragma unroll
for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) {
if (dim >= output_rank) {
break;
}
int q, r;
fdm_output_strides[dim].divmod(offset, q, r);
if (CondIndexType == BroadcastIndexType::NeedCompute) {
cond_index += static_cast<int>(cond_padded_strides[dim]) * q;
}
if (XIndexType == BroadcastIndexType::NeedCompute) {
x_index += static_cast<int>(x_padded_strides[dim]) * q;
}
if (YIndexType == BroadcastIndexType::NeedCompute) {
y_index += static_cast<int>(y_padded_strides[dim]) * q;
}
offset = r;
}
cond_value[i] = cond_data[cond_index];
x_value[i] = x_data[x_index];
y_value[i] = y_data[y_index];
id += NumThreadsPerBlock;
}
}
id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output_data[id] = cond_value[i] ? x_value[i] : y_value[i];
id += NumThreadsPerBlock;
}
}
}
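// The kernel above follows a two-pass pattern: each thread first gathers up to
// NumElementsPerThread (cond, x, y) triples into registers, then writes the selected values out,
// keeping the global stores contiguous within each iteration.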
// for scalar broadcast or non-broadcast case
template <typename T, BroadcastIndexType CondIndexType, BroadcastIndexType XIndexType, BroadcastIndexType YIndexType, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _TenaryElementWiseSimple(
const bool* cond_data,
const T* x_data,
const T* y_data,
T* output_data,
HIP_LONG N) {
HIP_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
bool cond_value[NumElementsPerThread];
T x_value[NumElementsPerThread];
T y_value[NumElementsPerThread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
cond_value[i] = cond_data[CondIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
x_value[i] = x_data[XIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
y_value[i] = y_data[YIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
id += NumThreadsPerBlock;
}
}
id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output_data[id] = cond_value[i] ? x_value[i] : y_value[i];
id += NumThreadsPerBlock;
}
}
}
#define HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \
case Y_INDEX_TYPE: { \
_TenaryElementWiseSimple<T, \
COND_INDEX_TYPE, \
X_INDEX_TYPE, \
Y_INDEX_TYPE, \
GridDim::maxThreadsPerBlock, \
GridDim::maxElementsPerThread> \
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(cond_data, \
x_data, \
y_data, \
output_data, \
N); \
} break
#define HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \
case X_INDEX_TYPE: { \
switch (Y_INDEX_TYPE_VAL) { \
HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \
HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \
} \
} break
#define HANDLE_COND_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \
case COND_INDEX_TYPE: { \
switch (X_INDEX_TYPE_VAL) { \
HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \
} \
} break
#define HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \
case Y_INDEX_TYPE: { \
_TenaryElementWise<T, \
COND_INDEX_TYPE, \
X_INDEX_TYPE, \
Y_INDEX_TYPE, \
GridDim::maxThreadsPerBlock, \
GridDim::maxElementsPerThread> \
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output_rank_or_simple_broadcast, \
cond_padded_strides, \
cond_data, \
x_padded_strides, \
x_data, \
y_padded_strides, \
y_data, \
fdm_output_strides, \
output_data, \
N); \
} break
#define HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \
case X_INDEX_TYPE: { \
switch (Y_INDEX_TYPE_VAL) { \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NeedCompute); \
} \
} break
#define HANDLE_COND_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \
case COND_INDEX_TYPE: { \
switch (X_INDEX_TYPE_VAL) { \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NeedCompute, Y_INDEX_TYPE_VAL); \
} \
} break
template <typename T>
void WhereImpl(
hipStream_t stream,
size_t output_rank_or_simple_broadcast,
BroadcastIndexType cond_index_type,
const TArray<int64_t>& cond_padded_strides,
const bool* cond_data,
BroadcastIndexType x_index_type,
const TArray<int64_t>& x_padded_strides,
const T* x_data,
BroadcastIndexType y_index_type,
const TArray<int64_t>& y_padded_strides,
const T* y_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
size_t count) {
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
HIP_LONG N = static_cast<HIP_LONG>(count);
if (output_rank_or_simple_broadcast == static_cast<size_t>(SimpleBroadcast::NoBroadcast)) {
switch (cond_index_type) {
HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type);
HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::Scalar, x_index_type, y_index_type);
}
} else {
switch (cond_index_type) {
HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type);
HANDLE_COND_INDEX_TYPE(BroadcastIndexType::Scalar, x_index_type, y_index_type);
HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NeedCompute, x_index_type, y_index_type);
}
}
}
#define SPECIALIZED_IMPL(T) \
template void WhereImpl<T>(hipStream_t stream, \
size_t output_rank_or_simple_broadcast, \
BroadcastIndexType cond_index_type, \
const TArray<int64_t>& cond_padded_strides, \
const bool* cond_data, \
BroadcastIndexType x_index_type, \
const TArray<int64_t>& x_padded_strides, \
const T* x_data, \
BroadcastIndexType y_index_type, \
const TArray<int64_t>& y_padded_strides, \
const T* y_data, \
const TArray<fast_divmod>& fdm_output_strides, \
T* output_data, \
size_t count);
SPECIALIZED_IMPL(uint8_t)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(int64_t)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double_t)
SPECIALIZED_IMPL(half)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
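// Elementwise select for the Where operator: output[i] = cond[i] ? x[i] : y[i].
// Each input may be used as-is (NoBroadcast), as a scalar (Scalar), or broadcast to the
// output shape via its padded strides (NeedCompute), as indicated by the corresponding
// *_index_type argument.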
template <typename T>
void WhereImpl(
hipStream_t stream,
size_t output_rank_or_simple_broadcast,
BroadcastIndexType cond_index_type,
const TArray<int64_t>& cond_padded_strides,
const bool* cond_data,
BroadcastIndexType x_index_type,
const TArray<int64_t>& x_padded_strides,
const T* x_data,
BroadcastIndexType y_index_type,
const TArray<int64_t>& y_padded_strides,
const T* y_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
size_t count);
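// Illustrative sketch (not part of the provider): invoking WhereImpl on the simple
// no-broadcast path. On this path the stride arguments are ignored by the kernel, so
// default-constructed TArrays are assumed to be sufficient here.
//
//   TArray<int64_t> unused_strides;
//   TArray<fast_divmod> unused_output_strides;
//   WhereImpl<float>(stream,
//                    static_cast<size_t>(SimpleBroadcast::NoBroadcast),
//                    BroadcastIndexType::NoBroadcast, unused_strides, cond_data,
//                    BroadcastIndexType::NoBroadcast, unused_strides, x_data,
//                    BroadcastIndexType::NoBroadcast, unused_strides, y_data,
//                    unused_output_strides, output_data, count);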
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef NDEBUG
namespace onnxruntime {
namespace rocm {
namespace test {
// This header declares test functions implemented in the ROCM EP-side bridge.
bool TestDeferredRelease();
bool TestDeferredReleaseWithoutArena();
bool TestBeamSearchTopK();
} // namespace test
} // namespace rocm
} // namespace onnxruntime
#endif
#ifndef NDEBUG
#include "contrib_ops/rocm/transformers/beam_search_topk.h"
#include <algorithm>
#include <numeric>
#include <queue>
#include <random>
#include <hip/hip_runtime.h>
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
namespace onnxruntime {
namespace rocm {
namespace test {
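// Fills `values` with a known permutation: entry [batch, beam, vocab] gets
// beam + vocab * beam_size, so within each batch the beam_size * vocab_size scores are
// exactly 0 .. beam_size * vocab_size - 1 with no duplicates; each beam's slice is then
// shuffled so the top-k positions are not trivial.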
void FillAndShuffle(std::vector<float>& values, int32_t batch_size, int32_t beam_size, int32_t vocab_size) {
std::random_device rd;
std::mt19937 generator(rd());
for (int32_t batch = 0; batch < batch_size; batch++) {
int32_t batch_base_idx = batch * beam_size * vocab_size;
for (int32_t beam = 0; beam < beam_size; beam++) {
int32_t value = beam;
int32_t beam_base_idx = beam * vocab_size;
for (int32_t vocab = 0; vocab < vocab_size; vocab++) {
        values[batch_base_idx + beam_base_idx + vocab] = static_cast<float>(value);
value += beam_size;
}
std::shuffle(values.begin() + batch_base_idx + beam_base_idx,
values.begin() + batch_base_idx + beam_base_idx + vocab_size,
generator);
}
}
}
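// CPU reference: for each batch, keep a min-heap of size k over the flattened
// [beam_size * vocab_size] scores, then drain it so results are stored in descending
// order. top_k_tokens holds the index within a beam's vocab, top_k_indices the beam.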
void ComputeTopKReference(const std::vector<float>& values,
std::vector<float>& top_k_values,
std::vector<int32_t>& top_k_tokens,
std::vector<int32_t>& top_k_indices,
int32_t batch_size,
int32_t beam_size,
int32_t vocab_size,
int32_t k) {
using VK = std::pair<float, int32_t>;
for (int32_t b = 0; b < batch_size; b++) {
std::priority_queue<VK, std::vector<VK>, std::greater<VK>> queue;
int32_t base_idx = b * beam_size * vocab_size;
    // Initialize the min-heap with the first k elements.
for (int32_t i = 0; i < k; i++) {
queue.push({values[base_idx + i], i});
}
for (int32_t i = k; i < beam_size * vocab_size; i++) {
if (values[base_idx + i] > queue.top().first) {
queue.pop();
queue.push({values[base_idx + i], i});
}
}
int32_t top_k_base_idx = b * k;
for (int32_t i = k - 1; i >= 0; i--) {
top_k_values[top_k_base_idx + i] = queue.top().first;
top_k_tokens[top_k_base_idx + i] = queue.top().second % vocab_size;
top_k_indices[top_k_base_idx + i] = queue.top().second / vocab_size;
queue.pop();
}
}
}
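// End-to-end check of contrib::rocm::BeamSearchTopK: generate shuffled scores on the
// host, compute the expected top-k with the reference above, run the device kernel on
// the default stream, and compare values, tokens and beam indices exactly. Exact
// comparison is safe because the scores are distinct small integers stored as float.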
bool TestBeamSearchTopK() {
int32_t batch_size = 4;
int32_t beam_size = 4;
int32_t vocab_size = 50257;
int32_t k = 2 * beam_size;
int32_t batch_x_beam_x_vocab = batch_size * beam_size * vocab_size;
std::vector<float> values(batch_x_beam_x_vocab);
FillAndShuffle(values, batch_size, beam_size, vocab_size);
std::vector<float> top_k_values_ref(batch_size * k);
std::vector<int32_t> top_k_tokens_ref(batch_size * k);
std::vector<int32_t> top_k_indices_ref(batch_size * k);
ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k);
const int32_t max_vocab_parts = 128;
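  // One device allocation carved into: the input scores, stage-1 temporaries
  // (batch * beam * k values and tokens per vocab partition, up to max_vocab_parts
  // partitions), stage-2 temporaries (batch * beam * k values and tokens), and the
  // three outputs of batch * k entries each. The literal 4 is sizeof(float), which
  // equals sizeof(int32_t).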
size_t buffer_size = batch_x_beam_x_vocab * 4 // input
+ batch_size * beam_size * k * (max_vocab_parts + 1) * 2 * 4 // tmp
+ batch_size * k * 3 * 4; // output size
void* rocm_buffer = nullptr;
hipMalloc(&rocm_buffer, buffer_size);
float* values_device = (float*)rocm_buffer;
float* top_k_1st_values_tmp = (float*)(values_device + batch_x_beam_x_vocab);
int32_t* top_k_1st_tokens_tmp = (int32_t*)(top_k_1st_values_tmp + batch_size * beam_size * k * max_vocab_parts);
float* top_k_2nd_values_tmp = (float*)(top_k_1st_tokens_tmp + batch_size * beam_size * k * max_vocab_parts);
int32_t* top_k_2nd_tokens_tmp = (int32_t*)(top_k_2nd_values_tmp + batch_size * beam_size * k);
float* top_k_value = (float*)(top_k_2nd_tokens_tmp + batch_size * beam_size * k);
int32_t* top_k_token = (int32_t*)(top_k_value + batch_size * k);
int32_t* top_k_indices = (int32_t*)(top_k_token + batch_size * k);
hipMemcpy(values_device, values.data(), batch_x_beam_x_vocab * 4, hipMemcpyHostToDevice);
contrib::rocm::BeamSearchTopK(values_device,
batch_size,
beam_size,
vocab_size,
k,
top_k_1st_values_tmp,
top_k_1st_tokens_tmp,
top_k_2nd_values_tmp,
top_k_2nd_tokens_tmp,
top_k_value,
top_k_token,
top_k_indices,
NULL /*stream*/);
std::vector<float> top_k_values_host(batch_size * k);
std::vector<int32_t> top_k_token_host(batch_size * k);
std::vector<int32_t> top_k_indices_host(batch_size * k);
hipMemcpy(top_k_values_host.data(), top_k_value, batch_size * k * 4, hipMemcpyDeviceToHost);
hipMemcpy(top_k_token_host.data(), top_k_token, batch_size * k * 4, hipMemcpyDeviceToHost);
hipMemcpy(top_k_indices_host.data(), top_k_indices, batch_size * k * 4, hipMemcpyDeviceToHost);
for (int32_t i = 0; i < batch_size * k; i++) {
if (top_k_values_ref[i] != top_k_values_host[i] ||
top_k_tokens_ref[i] != top_k_token_host[i] ||
top_k_indices_ref[i] != top_k_indices_host[i]) {
return false;
}
}
return true;
}
} // namespace test
} // namespace rocm
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// This test is built only under DEBUG mode because it requires
// extra code in the core of ROCM EP and that code may
// 1. slow down performance critical applications and
// 2. increase binary size of ORT.
#ifndef NDEBUG
#include <iostream>
#include "core/providers/rocm/test/all_tests.h"
#include "core/providers/rocm/rocm_execution_provider.h"
#include "core/providers/rocm/rocm_allocator.h"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
namespace onnxruntime {
namespace rocm {
namespace test {
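// Allocates pinned CPU buffers through the EP during a run, hands each one to
// AddDeferredReleaseCPUPtr, checks that the arena-backed pinned allocator recorded the
// expected number of allocations, and relies on OnRunEnd to flush the deferred list.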
bool TestDeferredRelease() {
// Create ROCM EP.
ROCMExecutionProviderInfo info;
ROCMExecutionProvider ep(info);
// Initialize allocators in EP.
onnxruntime::AllocatorManager allocator_manager;
ep.RegisterAllocator(allocator_manager);
// Allocator that calls hipHostMalloc and hipHostFree.
// For details, see ROCMPinnedAllocator in rocm_allocator.cc.
AllocatorPtr cpu_pinned_alloc = ep.GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
// 10 MB
const size_t n_bytes = 10 * 1000000;
const int64_t n_allocs = 64;
ORT_THROW_IF_ERROR(ep.OnRunStart());
for (size_t i = 0; i < n_allocs; ++i) {
// Allocate 10MB ROCM pinned memory.
auto pinned_buffer = ep.AllocateBufferOnCPUPinned<void>(n_bytes);
// Release it using ROCM callback.
ep.AddDeferredReleaseCPUPtr(pinned_buffer.release());
}
// Memory stats
AllocatorStats stats;
cpu_pinned_alloc->GetStats(&stats);
ORT_ENFORCE(stats.num_allocs == n_allocs);
ORT_THROW_IF_ERROR(ep.OnRunEnd(true));
return true;
}
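// Same flow as TestDeferredRelease, but the pinned allocator is created without a
// BFCArena and pre-registered in the AllocatorManager, so no allocation stats are
// checked; the test only verifies that deferred release completes without error.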
bool TestDeferredReleaseWithoutArena() {
// Create ROCM EP.
ROCMExecutionProviderInfo info;
ROCMExecutionProvider ep(info);
// Initialize allocators in EP.
onnxruntime::AllocatorManager allocator_manager;
OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
// Create allocator without BFCArena
AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return std::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
},
pinned_device.Id(),
false /* no arena */);
auto rocm_pinned_alloc = CreateAllocator(pinned_memory_info);
allocator_manager.InsertAllocator(rocm_pinned_alloc);
// Use existing allocator in allocator_manager.
// Also register new allocator created by this EP in allocator_manager.
ep.RegisterAllocator(allocator_manager);
// Allocator that calls hipHostMalloc and hipHostFree.
// For details, see ROCMPinnedAllocator in rocm_allocator.cc.
AllocatorPtr cpu_pinned_alloc = ep.GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
// 10 MB
const size_t n_bytes = 10 * 1000000;
const int64_t n_allocs = 64;
ORT_THROW_IF_ERROR(ep.OnRunStart());
for (size_t i = 0; i < n_allocs; ++i) {
// Allocate 10MB ROCM pinned memory.
auto pinned_buffer = ep.AllocateBufferOnCPUPinned<void>(n_bytes);
// Release it using ROCM callback.
ep.AddDeferredReleaseCPUPtr(pinned_buffer.release());
}
ORT_THROW_IF_ERROR(ep.OnRunEnd(true));
return true;
}
} // namespace test
} // namespace rocm
} // namespace onnxruntime
#endif