Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
Pipeline #492 failed with stages in 0 seconds
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/reduction/reduction_functions.h"
#include <algorithm>
#include <cassert>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include "core/common/common.h"
#include "core/providers/rocm/atomic/common.cuh"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/rocm/reduction/reduction_utils.cuh"
#include "core/providers/rocm/cu_inc/unary_elementwise_impl.cuh"
namespace onnxruntime {
namespace rocm {
namespace detail {
constexpr auto MAX_NUM_ELEMENTS_PER_THREAD = 4;
constexpr auto MAX_NUM_WARPS_PER_BLOCK = 8;
constexpr auto MAX_NUM_BLOCKS_IN_GRID_ROW = 256;
constexpr auto MAX_NUM_GRID_ROWS = 32768;
dim3 compute_block_dim(int num_cols) {
const int x = GPU_WARP_SIZE_HOST;
const int y = std::min(MAX_NUM_WARPS_PER_BLOCK, std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * x)));
return dim3(x, y);
}
std::pair<dim3, dim3> compute_grid_and_block_dims(int num_rows, int num_cols) {
const auto block_dim = compute_block_dim(num_cols);
const auto grid_x =
std::min<int>(
MAX_NUM_BLOCKS_IN_GRID_ROW,
std::max<int>(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * block_dim.x * block_dim.y)));
const auto grid_y = std::min(MAX_NUM_GRID_ROWS, num_rows);
const dim3 grid_dim(grid_x, grid_y);
return {grid_dim, block_dim};
}
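// Worked example (illustrative sketch, not part of the original source): assuming
// GPU_WARP_SIZE_HOST == 64 (typical for AMD wavefronts; some targets use 32),
// num_rows == 8 and num_cols == 8192:
//   block_dim = (64, min(8, max(1, 8192 / (4 * 64))))  = (64, 8)  -> 512 threads per block
//   grid_x    = min(256, max(1, 8192 / (4 * 64 * 8)))  = 4
//   grid_y    = min(32768, 8)                          = 8
// so the launch uses a 4x8 grid of 64x8 blocks.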
uintptr_t round_up_to_aligned(uintptr_t original, size_t alignment) {
assert((alignment & (alignment - 1)) == 0);
const size_t alignment_mask = ~(alignment - 1);
return (original + alignment - 1) & alignment_mask;
}
/**
* call_reduce_matrix_columns() intermediate buffer layout
*
* Given buffer element type TBuf, the intermediate buffer layout looks like this:
*
* -----
* m * num_blocks_per_row * sizeof(TBuf) bytes for block reductions per row
* alignment padding bytes as needed
* m * sizeof(int) bytes for block done counts per row
* -----
*/
size_t compute_reduce_matrix_columns_intermediate_buffer_size(
int element_size, int num_rows, int num_cols) {
ORT_ENFORCE(element_size >= 0 && num_rows >= 0 && num_cols >= 0);
const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first;
size_t buffer_size{};
// at the beginning, for sizing purposes, assume we are aligned
buffer_size += static_cast<size_t>(num_rows) * grid_dim.x * element_size;
buffer_size = round_up_to_aligned(buffer_size, alignof(int));
buffer_size += static_cast<size_t>(num_rows) * sizeof(int);
// add padding to give us room to align
buffer_size += alignof(max_align_t) - 1;
return buffer_size;
}
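// Worked example (illustrative sketch, not part of the original source): with
// num_rows == 8, grid_dim.x == 4 (the launch configuration computed above for
// num_cols == 8192) and TBuf == float (element_size == 4):
//   block reductions:  8 * 4 * 4 bytes                     = 128
//   round up to alignof(int) (4 on typical targets)        = 128
//   block done counts: + 8 * sizeof(int)                   = 160
//   alignment padding: + alignof(max_align_t) - 1          = 175 (assuming alignof(max_align_t) == 16)
// The exact total depends on the platform's alignment values.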
template <typename TBuf>
Status get_reduction_buffers(
int num_rows, int num_cols, void* buffer, size_t buffer_size,
TBuf*& block_reductions_buffer, int*& block_done_counts_buffer) {
const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first;
const uintptr_t begin_addr = reinterpret_cast<uintptr_t>(buffer);
const uintptr_t block_reductions_addr =
round_up_to_aligned(begin_addr, alignof(TBuf));
const uintptr_t block_done_counts_buffer_addr =
round_up_to_aligned(
block_reductions_addr + static_cast<size_t>(num_rows) * grid_dim.x * sizeof(TBuf), alignof(int));
const uintptr_t end_addr =
block_done_counts_buffer_addr + static_cast<size_t>(num_rows) * sizeof(int);
const size_t required_size = end_addr - begin_addr;
ORT_RETURN_IF_NOT(
required_size <= buffer_size,
"Buffer size is too small (", buffer_size, " bytes). ",
"At least ", required_size, " bytes are needed from the given base address (", buffer, ").");
block_reductions_buffer = reinterpret_cast<TBuf*>(block_reductions_addr);
block_done_counts_buffer = reinterpret_cast<int*>(block_done_counts_buffer_addr);
return Status::OK();
}
template <typename TIn, typename TOut, typename TBuf, typename TOp, typename TFinalOp, bool DivideResultBySize>
__device__ void reduce_all(
const int num_elements, const TIn* const input, TOut* const output,
TBuf* const block_reductions_buffer, int* const block_done_count_buffer) {
extern __shared__ unsigned char shared_memory_bytes[];
TBuf* shared_memory = reinterpret_cast<TBuf*>(shared_memory_bytes);
// Thread-level indices:
// Linear index of thread in block.
const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x;
// Total number of threads in a 2-D block.
const int num_threads_in_block = blockDim.x * blockDim.y;
// Warp-level indices:
// Warp index of thread.
const int wid_in_block = tid_in_block / GPU_WARP_SIZE;
// Lane index of thread.
const int lid_in_block = tid_in_block % GPU_WARP_SIZE;
// Warp count per block.
const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE;
// Grid-level indices:
// Linear index of block in grid row.
const int bid_in_grid_row = blockIdx.x;
// Linear index of thread in grid row.
const int tid_in_grid_row = bid_in_grid_row * (blockDim.x * blockDim.y) + tid_in_block;
// Total number of blocks in a grid row.
const int num_blocks_in_grid_row = gridDim.x;
// Total number of threads in a grid row with 2-D blocks.
const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block;
const auto write_result = [&output, &num_elements](const TOut result) {
// This if-else is controlled by a compile-time template argument, so the unused
// path is optimized out and no branch remains in the actual computation.
if (DivideResultBySize) {
output[0] = TFinalOp()(result / TOut(num_elements));
} else {
output[0] = TFinalOp()(result);
}
};
// Thread-level reduction (storage change: global memory -> register).
// One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register
// in one iteration.
TBuf value = 0;
for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) {
TIn v[MAX_NUM_ELEMENTS_PER_THREAD];
#pragma unroll
for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
const int offset = id + i * num_threads_in_grid_row;
if (offset < num_elements) {
v[i] = input[offset];
}
}
#pragma unroll
for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
const int offset = id + i * num_threads_in_grid_row;
if (offset < num_elements) {
value += TOp()(TBuf(v[i]));
}
}
}
#if __CUDA_ARCH__ >= 700
__syncwarp();
#else
__syncthreads();
#endif
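// Note: __CUDA_ARCH__ is not defined when compiling HIP device code for AMD GPUs,
// so the __syncthreads() branch is the one taken in a typical ROCm build.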
// Warp-level reduction (storage change: register -> register).
// The values in a warp will be summed up to a scalar. After warp-level
// reduction, each block holds num_warps_in_block values in the shared memory.
#pragma unroll
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
value += WARP_SHFL_DOWN(value, stride);
}
// Return early if only one warp is used for reduction.
// Given a fixed number of threads, we prefer filling threads before warps and warps
// before blocks, so we never end up with cases such as:
// 1. two blocks, each with only one warp (32 threads);
// 2. two warps, each with only two threads.
if (num_warps_in_block == 1) {
if (tid_in_grid_row == 0) {
write_result(value);
}
return;
}
if (lid_in_block == 0) {
shared_memory[wid_in_block] = value;
}
__syncthreads();
// Block-level reduction (storage change: shared memory -> global memory).
// The values in a block will be summed up to a scalar.
// Note that the per-warp partial sums are stored in shared memory, one per warp,
// and we keep halving the stride starting from MAX_NUM_WARPS_PER_BLOCK / 2.
// This covers all stored values because the number of warps in a block never
// exceeds MAX_NUM_WARPS_PER_BLOCK (compute_block_dim() caps blockDim.y at that value).
#pragma unroll
for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) {
if (tid_in_block + stride < num_warps_in_block) {
shared_memory[tid_in_block] += shared_memory[tid_in_block + stride];
}
__syncthreads();
}
// Return early if only one block is used for reduction.
if (num_blocks_in_grid_row == 1) {
if (tid_in_grid_row == 0) {
write_result(shared_memory[0]);
}
return;
}
if (tid_in_block == 0) {
block_reductions_buffer[bid_in_grid_row] = shared_memory[0];
}
__threadfence();
__syncthreads();
// Grid-level reduction. We use the last block to sum up values
// stored in the global block_reductions_buffer.
__shared__ bool is_last_block_done;
if (tid_in_block == 0) {
const int count = atomicAdd(block_done_count_buffer, 1);
is_last_block_done = (count == (num_blocks_in_grid_row - 1));
}
// All threads in each block check whether they belong to the last active block
// (i.e., they read the value of is_last_block_done).
__syncthreads();
// Only the block that observed count == num_blocks_in_grid_row - 1 enters the
// following branch.
if (is_last_block_done) {
const int pow2_bound = least_pow2_bound(num_blocks_in_grid_row);
for (int stride = pow2_bound / 2; stride > 0; stride /= 2) {
if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid_row) {
block_reductions_buffer[tid_in_block] += block_reductions_buffer[tid_in_block + stride];
}
__syncthreads();
}
// The first thread in the last block assigns the final output.
if (tid_in_block == 0) {
write_result(block_reductions_buffer[0]);
}
}
}
template <typename TIn, typename TOut, typename TBuf, typename TOp, typename TFinalOp, bool DivideResultBySize>
__global__ void reduce_matrix_columns_kernel(
const int num_rows, const int num_cols, const TIn* const input, TOut* const output,
TBuf* const block_reductions_buffer, int* const block_done_counts_buffer) {
const int num_blocks_in_grid_row = gridDim.x;
const int row_id_in_grid = blockIdx.y;
const int num_grid_rows = gridDim.y;
// one row per iteration
// row_id is int64_t to avoid int overflow in offset calculations
for (int64_t row_id = row_id_in_grid; row_id < num_rows; row_id += num_grid_rows) {
const TIn* const row_data = input + row_id * num_cols;
TOut* const row_output = output + row_id;
TBuf* const row_block_reductions_buffer = block_reductions_buffer + row_id * num_blocks_in_grid_row;
int* const row_block_done_counts_buffer = block_done_counts_buffer + row_id;
reduce_all<TIn, TOut, TBuf, TOp, TFinalOp, DivideResultBySize>(
num_cols, row_data, row_output,
row_block_reductions_buffer, row_block_done_counts_buffer);
}
}
template <typename TIn, typename TOut, typename TOp, typename TFinalOp, bool DivideResultBySize>
Status call_reduce_matrix_columns(
hipStream_t stream, const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) {
ORT_ENFORCE(num_rows >= 0 && num_cols >= 0);
using TBuf = AccumulationType_t<TIn>;
const auto grid_and_block_dims = compute_grid_and_block_dims(num_rows, num_cols);
const dim3& grid_dim = grid_and_block_dims.first;
const dim3& block_dim = grid_and_block_dims.second;
TBuf* block_reductions_buffer;
int* block_done_counts_buffer;
ORT_RETURN_IF_ERROR(get_reduction_buffers(
num_rows, num_cols, buffer, buffer_size,
block_reductions_buffer, block_done_counts_buffer));
// If more than one block is used per grid row, then inter-block reduction is needed.
if (grid_dim.x > 1) {
HIP_RETURN_IF_ERROR(hipMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int), stream));
}
const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE_HOST;
hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_columns_kernel<TIn, TOut, TBuf, TOp, TFinalOp, DivideResultBySize>), grid_dim, block_dim, shared_mem_size, stream,
num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer);
return Status::OK();
}
} // namespace detail
template <typename TIn, typename TOut>
Status reduce_sum(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, false>(
stream, input, output, 1, size, buffer, buffer_size);
}
template <typename TIn, typename TOut>
Status reduce_square_sum(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Square, Identity, false>(
stream, input, output, 1, size, buffer, buffer_size);
}
template <typename TIn, typename TOut>
Status reduce_l2_norm(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Square, Sqrt, false>(
stream, input, output, 1, size, buffer, buffer_size);
}
template <typename TIn, typename TOut>
Status reduce_mean(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, true>(
stream, input, output, 1, size, buffer, buffer_size);
}
#define INSTANTIATE_REDUCE_SUM(TIn, TOut) \
template Status reduce_sum<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_SUM(half, half);
INSTANTIATE_REDUCE_SUM(half, float);
INSTANTIATE_REDUCE_SUM(float, float);
INSTANTIATE_REDUCE_SUM(double, double);
INSTANTIATE_REDUCE_SUM(BFloat16, BFloat16);
INSTANTIATE_REDUCE_SUM(BFloat16, float);
#undef INSTANTIATE_REDUCE_SUM
#define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \
template Status reduce_square_sum<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_SQUARE_SUM(half, float);
INSTANTIATE_REDUCE_SQUARE_SUM(float, float);
INSTANTIATE_REDUCE_SQUARE_SUM(double, double);
INSTANTIATE_REDUCE_SQUARE_SUM(BFloat16, float);
#undef INSTANTIATE_REDUCE_SQUARE_SUM
#define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \
template Status reduce_l2_norm<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_L2_NORM(half, float);
INSTANTIATE_REDUCE_L2_NORM(float, float);
INSTANTIATE_REDUCE_L2_NORM(double, double);
#undef INSTANTIATE_REDUCE_L2_NORM
#define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \
template Status reduce_mean<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_MEAN(half, float);
INSTANTIATE_REDUCE_MEAN(float, float);
INSTANTIATE_REDUCE_MEAN(double, double);
#undef INSTANTIATE_REDUCE_MEAN
namespace detail {
template <typename TIn, typename TOut, typename TBuf>
__global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) {
constexpr int x_load_count_per_thread = 1;
constexpr int y_load_count_per_thread = 4;
const int t_count_x_in_grid = blockDim.x * gridDim.x;
const int t_count_y_in_grid = blockDim.y * gridDim.y;
const int x_grid_stride = t_count_x_in_grid * x_load_count_per_thread;
const int y_grid_stride = t_count_y_in_grid * y_load_count_per_thread;
const int tid_x_in_grid = threadIdx.x + blockDim.x * blockIdx.x;
const int tid_y_in_grid = threadIdx.y + blockDim.y * blockIdx.y;
const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y;
// Shape is blockDim.y-by-blockDim.x and element type is TBuf.
extern __shared__ unsigned char shared_memory_bytes[];
TBuf* shared_memory = reinterpret_cast<TBuf*>(shared_memory_bytes);
// to prevent int overflow in index calculation for input size m*n
const int64_t n_int64 = static_cast<int64_t>(n);
for (int col = tid_x_in_grid; col < n; col += x_grid_stride) {
shared_memory[tid_in_block] = TBuf(0.0f);
TBuf sum = TBuf(0.0f);
// This loop loads multiple blockDim.y-by-blockDim.x sub-tensors from the input.
for (int row = tid_y_in_grid; row < m; row += y_grid_stride) {
// Thread-level reduction. Each thread loads y_load_count_per_thread values
// and aggregates them.
#pragma unroll y_load_count_per_thread
for (int row_inner = 0; row_inner < y_load_count_per_thread; ++row_inner) {
int row_final = row + row_inner * t_count_y_in_grid;
int col_final = col;
if (row_final < m && col_final < n) {
sum += TBuf(input[row_final * n_int64 + col_final]);
}
}
}
// Write thread-level reduction result into shared memory.
shared_memory[tid_in_block] = sum;
// Wait for all threads to finish their thread-level reductions.
__syncthreads();
// This loop conducts reduction on elements stored in shared memory.
// Each block reduces blockDim.y-by-blockDim.x tensor to 1-by-blockDim.x tensor.
#pragma unroll 4
for (int stride = blockDim.y / 2; stride > 0; stride /= 2) {
if (threadIdx.y < stride) {
shared_memory[tid_in_block] += shared_memory[tid_in_block + stride * blockDim.x];
}
__syncthreads();
}
if (threadIdx.y == 0) {
atomic_add(output + col, TOut(shared_memory[threadIdx.x]));
}
}
}
template <typename TIn, typename TOut, typename TBuf>
Status call_reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) {
ORT_ENFORCE(m >= 0 && n >= 0);
if (reset_initial_output) {
HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut), stream));
}
constexpr int max_num_threads_in_block = 512;
constexpr int max_num_blocks_in_grid = 512;
constexpr int load_count_per_thread = 4;
const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE_HOST)));
const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));
const dim3 grid(grid_x_dim, grid_y_dim, 1);
const dim3 block(block_x_dim, block_y_dim, 1);
reduce_matrix_rows_kernel<TIn, TOut, TBuf><<<grid, block, block.y * block.x * sizeof(TBuf), stream>>>(
input, output, m, n);
return Status::OK();
}
} // namespace detail
template <typename T>
struct OP_Div {
__device__ __inline__ T operator()(const T& a) const {
return a / v_;
}
OP_Div(T v) : v_(v) {}
T v_;
};
template <typename T>
void UnaryDiv(hipStream_t stream, const T* input, T* output, T denominator, size_t count) {
UnaryElementWiseImpl(stream, input, output, OP_Div<T>(denominator), count);
}
#define INSTANTIATE_UNARY_DIV(T) \
template void UnaryDiv<T>(hipStream_t stream, const T* input, T* output, T denominator, size_t count)
INSTANTIATE_UNARY_DIV(half);
INSTANTIATE_UNARY_DIV(float);
INSTANTIATE_UNARY_DIV(double);
INSTANTIATE_UNARY_DIV(BFloat16);
#undef INSTANTIATE_UNARY_DIV
template <typename TIn, typename TOut>
Status reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) {
using TBuf = AccumulationType_t<TIn>;
return detail::call_reduce_matrix_rows<TIn, TOut, TBuf>(stream, input, output, m, n, reset_initial_output);
}
#define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \
template Status reduce_matrix_rows<T, T>(hipStream_t stream, const T* input, T* output, int m, int n, bool reset_initial_output)
INSTANTIATE_REDUCE_MATRIX_ROWS(half);
INSTANTIATE_REDUCE_MATRIX_ROWS(float);
INSTANTIATE_REDUCE_MATRIX_ROWS(double);
INSTANTIATE_REDUCE_MATRIX_ROWS(BFloat16);
#undef INSTANTIATE_REDUCE_MATRIX_ROWS
template <typename TIn, typename TOut>
Status reduce_matrix_columns(hipStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, false>(
stream, input, output, m, n, buffer, buffer_size);
}
#define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \
template Status reduce_matrix_columns<T, T>(hipStream_t stream, const T* input, T* output, int m, int n, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_MATRIX_COLUMNS(half);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(float);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(double);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(BFloat16);
#undef INSTANTIATE_REDUCE_MATRIX_COLUMNS
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/accumulation_type.h"
namespace onnxruntime {
namespace rocm {
namespace detail {
size_t compute_reduce_matrix_columns_intermediate_buffer_size(
int element_size, int num_rows, int num_cols);
} // namespace detail
/**
* Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns().
* @tparam TIn The input data type.
* @param m The number of matrix rows.
* @param n The number of matrix columns.
* @return The size of the intermediate buffer.
*/
template <typename TIn>
size_t compute_reduce_matrix_columns_buffer_size(int m, int n) {
using TBuf = AccumulationType_t<TIn>;
return detail::compute_reduce_matrix_columns_intermediate_buffer_size(
sizeof(TBuf), m, n);
}
/**
* Computes the size in bytes of the intermediate buffer needed by the reduce_x() functions.
* @tparam TIn The input data type.
* @param size The number of elements.
* @return The size of the intermediate buffer.
*/
template <typename TIn>
size_t compute_reduction_buffer_size(int size) {
using TBuf = AccumulationType_t<TIn>;
return detail::compute_reduce_matrix_columns_intermediate_buffer_size(
sizeof(TBuf), 1, size);
}
/** Computes the sum of the given elements. */
template <typename TIn, typename TOut>
Status reduce_sum(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
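/**
 * Usage sketch (illustrative only, not part of the original source). It assumes
 * `d_in` and `d_out` are valid float device pointers, `size` is the element count,
 * and `stream` is a valid hipStream_t; error handling is abbreviated.
 *
 *   size_t scratch_bytes = compute_reduction_buffer_size<float>(size);
 *   void* scratch = nullptr;
 *   HIP_CALL_THROW(hipMalloc(&scratch, scratch_bytes));
 *   ORT_RETURN_IF_ERROR(reduce_sum(stream, d_in, d_out, size, scratch, scratch_bytes));
 *   HIP_CALL_THROW(hipFree(scratch));
 */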
/** Computes the sum of the squares of the given elements. */
template <typename TIn, typename TOut>
Status reduce_square_sum(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
/** Computes the L2 norm of the given elements. */
template <typename TIn, typename TOut>
Status reduce_l2_norm(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
/** Computes the mean of the given elements. */
template <typename TIn, typename TOut>
Status reduce_mean(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
enum class ApplicableMatrixReduction {
// can use reduce_matrix_rows()
Rows,
// can use reduce_matrix_columns()
Columns,
// no optimized matrix reduction function applies
None,
};
/**
* Determines whether a MIOpen reduction can be computed by an optimized matrix reduction function.
* @param miopen_reduce_op The MIOpen reduction op type.
* @param dims The input dimensions.
* @param axes The reduction axes.
* @param[out] m If matrix reduction is possible, the number of matrix rows to use.
* @param[out] n If matrix reduction is possible, the number of matrix columns to use.
* @return The type of matrix reduction that can be done.
*/
ApplicableMatrixReduction get_applicable_matrix_reduction(
const miopenReduceTensorOp_t miopen_reduce_op,
gsl::span<const int64_t> dims, gsl::span<const int64_t> axes,
int& m, int& n);
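/**
 * Example (illustrative reading of the mapping, not from the original source):
 * for dims = {2, 3, 4},
 *   reducing axes {0} can be handled by reduce_matrix_rows()    with m = 2, n = 12;
 *   reducing axes {2} can be handled by reduce_matrix_columns() with m = 6, n = 4;
 *   reducing axes {1} matches neither pattern, so None is returned.
 * The exact set of supported ops and axis patterns is determined by the implementation.
 */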
/**
* Reduces the rows in a row-major matrix to a single row containing the sum of each column.
* @param input The input data.
* @param output The output data.
* @param m The number of matrix rows.
* @param n The number of matrix columns.
* @param reset_initial_output Whether to reset (i.e., zero) the output values first.
*/
template <typename TIn, typename TOut>
Status reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true);
/**
* Reduces the columns in a row-major matrix to a single column containing the sum of each row.
* @param input The input data.
* @param output The output data.
* @param m The number of matrix rows.
* @param n The number of matrix columns.
* @param buffer The intermediate buffer.
* @param buffer_size The size of the intermediate buffer in bytes.
*/
template <typename TIn, typename TOut>
Status reduce_matrix_columns(hipStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size);
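/**
 * Example of reduce_matrix_rows() and reduce_matrix_columns() above (illustrative
 * only): for the row-major 2x3 matrix {{1, 2, 3}, {4, 5, 6}},
 *   reduce_matrix_rows()    produces the column sums {5, 7, 9};
 *   reduce_matrix_columns() produces the row sums    {6, 15}.
 */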
/** Apply unary elementwise division. */
template <typename T>
void UnaryDiv(hipStream_t stream, const T* input, T* output, T denominator, size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/optional.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/providers/rocm/reduction/reduction_functions.h"
namespace onnxruntime {
namespace rocm {
namespace ReductionOps {
// Implementation that holds the core logic of reduction op processing
// `input_shape_override` is the input shape for compute purposes (if provided)
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
std::unique_ptr<Tensor> ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, AllocatorPtr allocator,
const Tensor& input, gsl::span<const int64_t> axes,
bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp,
bool fast_reduction, const TensorShape* input_shape_override = nullptr);
} // namespace ReductionOps
// Holds some metadata that will be used during actual reduction op compute time
struct PrepareReduceMetadata {
int64_t input_count;
int64_t output_count;
// This holds the output dims without any reduced dims squeezed (even if keep_dims == 1)
TensorShapeVector output_dims;
// This holds the output dims with reduced dims squeezed (if keep_dims == 1)
TensorShapeVector squeezed_output_dims;
TensorShapeVector input_dims_miopen;
TensorShapeVector output_dims_miopen;
};
template <bool allow_multi_axes>
class ReduceKernel : public RocmKernel, public ReduceKernelBase<allow_multi_axes> {
protected:
ReduceKernel(
const OpKernelInfo& info,
optional<int64_t> keep_dims_override = {})
: RocmKernel(info),
ReduceKernelBase<allow_multi_axes>(info, keep_dims_override),
calculate_log_(false),
calculate_sqt_(false),
log_sum_exp_(false),
fast_reduction_(false) {
// We need to cast away the const as PerThreadMiopenHandle() is currently a non-const method
// TODO: Clean up the ROCMExecutionProvider interface to avoid this
rocm_ep_ = const_cast<ROCMExecutionProvider*>(static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
}
// Only Max and Min need ReduceTensorIndices set to MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES, per the MIOpen library manual.
// Only Max and Min have an indices output; the indices must be set to nullptr for the other ops.
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
// Used by ReduceSumTraining which will have axes as input
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
template <typename T, typename OutT, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernelShared(
const T* X,
const TensorShape& input_shape,
OutT* Y,
const TensorShape& output_shape,
miopenReduceTensorOp_t miopen_reduce_op,
TensorShapeVector& output_dims) const;
using ReduceKernelBase<allow_multi_axes>::axes_;
using ReduceKernelBase<allow_multi_axes>::keepdims_;
using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
bool calculate_log_;
bool calculate_sqt_;
bool log_sum_exp_;
// Indicates if this reduction can be delegated to our highly-optimized reduction kernels.
// Those efficient kernels are defined/implemented in reduction_functions.h/.cu.
bool fast_reduction_;
// We need access to the ROCM EP instance to get the MIOpen handle
ROCMExecutionProvider* rocm_ep_;
};
template <typename T>
class ArgMax final : public ReduceKernel<false> {
public:
ArgMax(const OpKernelInfo& info) : ReduceKernel<false>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
}
};
template <typename T>
class ArgMin final : public ReduceKernel<false> {
public:
ArgMin(const OpKernelInfo& info) : ReduceKernel<false>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
}
};
template <typename T>
class ReduceL1 final : public ReduceKernel<true> {
public:
ReduceL1(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM1);
}
};
template <typename T>
class ReduceL2 final : public ReduceKernel<true> {
public:
ReduceL2(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM2);
}
};
template <typename T>
class ReduceMax final : public ReduceKernel<true> {
public:
ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
}
};
template <typename T>
class ReduceMean final : public ReduceKernel<true> {
public:
ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_AVG);
}
};
template <typename T>
class ReduceMin final : public ReduceKernel<true> {
public:
ReduceMin(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
}
};
template <typename T>
class ReduceProd final : public ReduceKernel<true> {
public:
ReduceProd(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MUL);
}
};
template <typename T>
class ReduceSum final : public ReduceKernel<true> {
public:
ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSum final : public ReduceKernel<true> {
public:
ReduceLogSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::calculate_log_ = true;
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceSumSquare final : public ReduceKernel<true> {
public:
ReduceSumSquare(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::calculate_sqt_ = true;
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSumExp final : public ReduceKernel<true> {
public:
ReduceLogSumExp(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::log_sum_exp_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
Status PrepareForReduce(const Tensor* X,
bool keepdims,
gsl::span<const int64_t> axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override = nullptr);
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op,
gsl::span<const int64_t> axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override = nullptr);
// MIOpen's reduction descriptor miopenReduceTensorDescriptor_t is a pointer, so it is
// safer to wrap it in MiopenReduceDescriptor, which releases the descriptor automatically.
// An implicit conversion from MiopenReduceDescriptor to miopenReduceTensorDescriptor_t
// is implemented below, so it can be passed seamlessly to MIOpen calls.
class MiopenReduceDescriptor final {
public:
MiopenReduceDescriptor() : desc_(nullptr) {
}
~MiopenReduceDescriptor() {
if (desc_ != nullptr) {
miopenDestroyReduceTensorDescriptor(desc_);
desc_ = nullptr;
}
}
MiopenReduceDescriptor(const MiopenReduceDescriptor&) = delete;
MiopenReduceDescriptor& operator=(const MiopenReduceDescriptor&) = delete;
Status Set(miopenReduceTensorOp_t op, miopenDataType_t type, miopenReduceTensorIndices_t indices) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreateReduceTensorDescriptor(&desc_));
MIOPEN_RETURN_IF_ERROR(miopenSetReduceTensorDescriptor(
desc_,
op,
type,
MIOPEN_PROPAGATE_NAN,
indices,
MIOPEN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
return Status::OK();
}
operator miopenReduceTensorDescriptor_t() const { return desc_; }
private:
miopenReduceTensorDescriptor_t desc_;
};
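// Usage sketch (illustrative only, not part of the original source; assumes the
// surrounding tensor descriptors are set up elsewhere):
//
//   MiopenReduceDescriptor reduce_desc;
//   ORT_RETURN_IF_ERROR(reduce_desc.Set(MIOPEN_REDUCE_TENSOR_ADD, miopenFloat,
//                                       MIOPEN_REDUCE_TENSOR_NO_INDICES));
//   // The implicit conversion operator lets reduce_desc be passed wherever a
//   // miopenReduceTensorDescriptor_t is expected, e.g. in miopenReduceTensor().
//   miopenReduceTensorDescriptor_t raw_desc = reduce_desc;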
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
__forceinline__ __host__ __device__ int least_pow2_bound(int value) {
unsigned int value_ = static_cast<unsigned int>(value);
--value_;
value_ |= value_ >> 1;
value_ |= value_ >> 2;
value_ |= value_ >> 4;
value_ |= value_ >> 8;
value_ |= value_ >> 16;
return static_cast<int>(++value_);
}
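// Examples (illustrative): least_pow2_bound(1) == 1, least_pow2_bound(5) == 8,
// least_pow2_bound(8) == 8, least_pow2_bound(9) == 16.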
struct Square {
template <typename T>
__forceinline__ __device__ T operator()(const T& value) {
return value * value;
}
};
struct Sqrt {
template <typename T>
__forceinline__ __device__ T operator()(const T& value) {
return _Sqrt(value);
}
};
struct Identity {
template <typename T>
__forceinline__ __device__ T operator()(const T& value) {
return value;
}
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_check_memory.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
void CheckIfMemoryOnCurrentGpuDevice(const void* ptr) {
hipPointerAttribute_t attrs;
HIP_CALL_THROW(hipPointerGetAttributes(&attrs, ptr));
int current_device;
HIP_CALL_THROW(hipGetDevice(&current_device));
ORT_ENFORCE(attrs.device == current_device,
"Current ROCM device is ", current_device,
" but the memory of pointer ", ptr,
" is allocated on device ", attrs.device);
}
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
// Throw if "ptr" is not allocated on the ROCM device obtained by hipGetDevice.
void CheckIfMemoryOnCurrentGpuDevice(const void* ptr);
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_graph.h"
#include "core/providers/rocm/rocm_common.h"
#include <hip/hip_runtime_api.h>
#include <hip/driver_types.h>
namespace onnxruntime {
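// Note: the CUDA_VERSION guards in this file come from the hipified CUDA EP
// implementation. If CUDA_VERSION is not defined by any included header, the
// preprocessor takes the ORT_THROW branches below, so graph capture is unavailable.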
ROCMGraph::ROCMGraph(hipStream_t stream) : stream_(stream) {
#if (defined(CUDA_VERSION) && CUDA_VERSION < 10000)
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
void ROCMGraph::SetStream(hipStream_t stream) {
stream_ = stream;
}
void ROCMGraph::CaptureBegin() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
ORT_ENFORCE(!has_graph_exec_,
"This rocm graph has already captured a graph. "
"Create a new instance to capture a new graph.");
HIP_CALL_THROW(hipStreamSynchronize(stream_));
// For now, the rocm graph can only work with a single thread. In the future, we
// will support multiple threads. For multiple threads with multiple graphs
// and streams, `hipStreamCaptureModeGlobal` needs to be changed to
// `hipStreamCaptureModeThreadLocal`.
HIP_CALL_THROW(hipStreamBeginCapture(stream_, hipStreamCaptureModeGlobal));
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
void ROCMGraph::CaptureEnd() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
HIP_CALL_THROW(hipStreamEndCapture(stream_, &graph_));
if (graph_ == NULL) {
ORT_THROW("ROCMGraph::CaptureEnd: graph_ is NULL");
}
has_graph_ = true;
HIP_CALL_THROW(hipGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
has_graph_exec_ = true;
HIP_CALL_THROW(hipGraphDestroy(graph_));
has_graph_ = false;
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
Status ROCMGraph::Replay() {
// Although this function is not thread safe, the lock is not needed here because
// ROCM EP maintains a separate rocm graph per thread
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
LOGS_DEFAULT(INFO) << "Replaying ROCM graph on stream " << stream_;
HIP_RETURN_IF_ERROR(hipGraphLaunch(graph_exec_, stream_));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(stream_));
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
return Status::OK();
}
void ROCMGraph::Reset() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
if (has_graph_) {
HIP_CALL_THROW(hipGraphDestroy(graph_));
has_graph_ = false;
}
if (has_graph_exec_) {
HIP_CALL_THROW(hipGraphExecDestroy(graph_exec_));
has_graph_exec_ = false;
}
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
ROCMGraph::~ROCMGraph() {
Reset();
}
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/platform/ort_mutex.h"
#include "core/providers/rocm/rocm_pch.h"
namespace onnxruntime {
using CaptureId_t = unsigned long long;
struct ROCMGraph {
ROCMGraph() {}
ROCMGraph(hipStream_t stream);
~ROCMGraph();
void SetStream(hipStream_t stream);
void CaptureBegin();
void CaptureEnd();
Status Replay();
void Reset();
private:
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
hipGraph_t graph_ = NULL;
hipGraphExec_t graph_exec_ = NULL;
#endif
bool has_graph_ = false;
bool has_graph_exec_ = false;
hipStream_t stream_ = nullptr; // Does not own the stream
};
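// Usage sketch (illustrative only, not part of the original source; capture only
// succeeds in a build where the CUDA_VERSION-guarded paths are enabled, otherwise
// these calls throw):
//
//   ROCMGraph graph(stream);
//   graph.CaptureBegin();
//   // ... enqueue work on `stream` ...
//   graph.CaptureEnd();
//   ORT_RETURN_IF_ERROR(graph.Replay());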
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <memory>
#include "core/providers/providers.h"
struct OrtROCMProviderOptions;
struct OrtROCMProviderOptionsV2;
namespace onnxruntime {
// defined in provider_bridge_ort.cc
struct CudaProviderFactoryCreator {
static std::shared_ptr<IExecutionProviderFactory> Create(const OrtROCMProviderOptions* provider_options);
static std::shared_ptr<IExecutionProviderFactory> Create(const OrtROCMProviderOptionsV2* provider_options);
};
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <hip/hip_fp16.h>
#include "core/framework/float16.h"
namespace onnxruntime {
namespace rocm {
// specifies the auxiliary type to use for accumulation of the given type
template <typename T>
struct AccumulationType;
template <>
struct AccumulationType<half> { using type = float; };
template <>
struct AccumulationType<float> { using type = float; };
template <>
struct AccumulationType<double> { using type = double; };
template <>
struct AccumulationType<BFloat16> { using type = float; };
template <typename T>
using AccumulationType_t = typename AccumulationType<T>::type;
} // namespace rocm
} // namespace onnxruntime
//
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <iostream>
#include <limits>
#include <hip/hip_runtime.h>
#include <cmath>
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
// The code below is based on section 4 (Unsigned division) of the paper https://gmplib.org/~tege/divcnst-pldi94.pdf
// In current ORT, fast_divmod is used for calculating the position of an element in a tensor,
// so the unsigned integer division from the paper is good enough for ORT. The advantage is that the division is
// very simple, so the GPU compiler can easily unroll loops in which divmod is called.
struct fast_divmod {
fast_divmod(int d = 1) {
d_ = d == 0 ? 1 : d;
ORT_ENFORCE(d_ >= 1 && d_ <= static_cast<uint32_t>(std::numeric_limits<int>::max()));
for (l_ = 0; l_ < 32; l_++)
if ((1U << l_) >= d_) break;
uint64_t one = 1;
uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
M_ = static_cast<uint32_t>(m);
// according to the paper, the value of m' should fit in an unsigned integer.
ORT_ENFORCE(M_ > 0 && M_ == m);
}
__host__ __device__ inline int div(int n) const {
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
uint32_t t = __umulhi(M_, n);
return (t + n) >> l_;
#else
// Using uint64_t for t, then t + n won't overflow.
uint64_t t = ((uint64_t)M_ * n) >> 32;
return static_cast<int>((t + n) >> l_);
#endif
}
__host__ __device__ inline int mod(int n) const {
return n - div(n) * d_;
}
__host__ __device__ inline void divmod(int n, int& q, int& r) const {
q = div(n);
r = n - q * d_;
}
uint32_t d_; // divisor
uint32_t M_; // m' in the paper.
uint32_t l_; // l_ = ceil(log2(d_))
};
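// Worked example (illustrative): for d == 3,
//   l_ == 2 (since 1 << 2 >= 3) and M_ == (2^32 * (4 - 3)) / 3 + 1 == 1431655766, so
//   div(7): t = umulhi(M_, 7) = 2 and (2 + 7) >> 2 == 2;  mod(7) == 7 - 2 * 3 == 1.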
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
Status GemmInt8(int m,
int n,
int k,
int32_t alpha_matmul,
int32_t beta_matmul,
const int8_t* a,
int lda,
const int8_t* b,
int ldb,
int32_t* c,
int ldc,
const RocmKernel* rocm_kernel);
}
}
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// for things shared between nvcc and onnxruntime
// as currently nvcc cannot compile all onnxruntime headers
#pragma once
#include <hip/hip_fp16.h>
#include <limits>
#include <memory>
#include <type_traits>
#include <vector>
#include "core/common/gsl.h"
#include "core/framework/float16.h"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
enum class SimpleBroadcast : int32_t {
NoBroadcast = (int32_t)-1,
LeftScalar = (int32_t)-2,
RightScalar = (int32_t)-3,
RightPerChannelBatch1 = (int32_t)-4,
RightPerChannelBatchN = (int32_t)-5,
};
enum class BroadcastIndexType : int32_t {
NoBroadcast = (int32_t)0,
Scalar = (int32_t)1,
NeedCompute = (int32_t)2,
};
template <typename T>
class IConstantBuffer {
public:
virtual ~IConstantBuffer(){};
virtual const T* GetBuffer(hipStream_t stream, size_t count) = 0;
};
template <typename T>
std::unique_ptr<IConstantBuffer<T>> CreateConstantOnes();
template <typename T>
void Fill(hipStream_t stream, T* output, T value, int64_t count);
/*
This is a utility wrapper for a fixed-capacity array of arbitrary type.
It is commonly used for passing a small list of metadata during a rocm kernel launch.
It is better to pass the array by value than to issue another hipMemcpy to move the data to the device.
*/
template <typename T, int32_t capacity = 8>
struct TArray {
#if defined(USE_ROCM)
#define TARRAY_CONSTRUCTOR_SPECIFIERS __host__ __device__
#else
#define TARRAY_CONSTRUCTOR_SPECIFIERS
#endif
TARRAY_CONSTRUCTOR_SPECIFIERS TArray() = default;
TARRAY_CONSTRUCTOR_SPECIFIERS TArray(const TArray&) = default;
TARRAY_CONSTRUCTOR_SPECIFIERS TArray& operator=(const TArray&) = default;
#undef TARRAY_CONSTRUCTOR_SPECIFIERS
TArray(int32_t size) : size_(size), data_() {
ORT_ENFORCE(
0 <= size && size <= capacity,
"TArray size must be within range [0, ", capacity, "]. Actual: ", size);
}
TArray(const std::vector<T>& vec) : TArray(static_cast<int32_t>(vec.size())) {
static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
memcpy(data_, vec.data(), vec.size() * sizeof(T));
}
TArray(gsl::span<const T> vec) : TArray(static_cast<int32_t>(vec.size())) {
static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
memcpy(data_, vec.data(), vec.size() * sizeof(T));
}
void SetSize(int32_t size) {
ORT_ENFORCE(
0 <= size && size <= capacity,
"TArray size must be within range [0, ", capacity, "]. Actual: ", size);
size_ = size;
}
__host__ __device__ int32_t Size() const {
return size_;
}
__host__ __device__ T& operator[](int32_t index) {
return data_[index];
}
__host__ __device__ __forceinline__ const T& operator[](int32_t index) const {
return data_[index];
}
__host__ __device__ T* Data() {
return data_;
}
__host__ __device__ const T* Data() const {
return data_;
}
static constexpr int32_t Capacity() { return capacity; };
private:
int32_t size_ = 0;
T data_[capacity] = {};
};
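// Usage sketch (illustrative only, not part of the original source): metadata such
// as strides can be packed into a TArray on the host and passed to a kernel by value:
//
//   std::vector<fast_divmod> strides_vec = ...;   // at most `capacity` entries
//   TArray<fast_divmod> strides(strides_vec);
//   hipLaunchKernelGGL(some_kernel, grid, block, 0, stream, strides, ...);
//
// where `some_kernel` is a hypothetical kernel taking a `TArray<fast_divmod>` by value.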
// Bitmask tensor is uint_32 type.
using BitmaskElementType = uint32_t;
constexpr int kNumBitsPerBitmaskElement = std::numeric_limits<BitmaskElementType>::digits;
template <typename T>
struct NumericLimits {
__inline__ __host__ __device__ static T Min() {
return std::numeric_limits<T>::lowest();
}
__inline__ __host__ __device__ static T Max() {
return std::numeric_limits<T>::max();
}
};
template <>
struct NumericLimits<MLFloat16> {
__inline__ __host__ __device__ static half Min() {
return -65504.0;
}
__inline__ __host__ __device__ static half Max() {
return 65504.0;
}
};
template <>
struct NumericLimits<half> {
__inline__ __host__ __device__ static half Min() {
return -65504.0;
}
__inline__ __host__ __device__ static half Max() {
return 65504.0;
}
};
template <>
struct NumericLimits<float> {
__inline__ __host__ __device__ static float Min() {
return -INFINITY;
}
__inline__ __host__ __device__ static float Max() {
return INFINITY;
}
};
template <>
struct NumericLimits<double> {
__inline__ __host__ __device__ static double Min() {
return -HUGE_VAL;
}
__inline__ __host__ __device__ static double Max() {
return HUGE_VAL;
}
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "cast_op.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace ONNX_NAMESPACE;
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
const std::vector<MLDataType> castOpTypeConstraints{
DataTypeImpl::GetTensorType<MLFloat16>(),
DataTypeImpl::GetTensorType<BFloat16>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<int8_t>(),
DataTypeImpl::GetTensorType<int16_t>(),
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<uint8_t>(),
DataTypeImpl::GetTensorType<uint16_t>(),
DataTypeImpl::GetTensorType<uint32_t>(),
DataTypeImpl::GetTensorType<uint64_t>(),
DataTypeImpl::GetTensorType<bool>()
};
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
6, 8, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
9, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>);
template <typename SrcT>
Status Cast<SrcT>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<SrcT>::MappedType CudaSrcT;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& shape = X->Shape();
Tensor* Y = context->Output(0, shape);
const auto* x_data = reinterpret_cast<const CudaSrcT*>(X->Data<SrcT>());
size_t count = shape.Size();
#define CASE(TP_TYPE, DstT) \
case TP_TYPE: \
if (count > 0) { \
Impl_Cast<CudaSrcT, typename ToHipType<DstT>::MappedType>( \
Stream(), \
x_data, \
reinterpret_cast<typename ToHipType<DstT>::MappedType*>(Y->MutableData<DstT>()), \
count); \
} \
break;
switch (to_) {
CASE(TensorProto_DataType_FLOAT16, MLFloat16)
CASE(TensorProto_DataType_BFLOAT16, BFloat16)
CASE(TensorProto_DataType_FLOAT, float)
CASE(TensorProto_DataType_DOUBLE, double)
CASE(TensorProto_DataType_INT8, int8_t)
CASE(TensorProto_DataType_INT16, int16_t)
CASE(TensorProto_DataType_INT32, int32_t)
CASE(TensorProto_DataType_INT64, int64_t)
CASE(TensorProto_DataType_UINT8, uint8_t)
CASE(TensorProto_DataType_UINT16, uint16_t)
CASE(TensorProto_DataType_UINT32, uint32_t)
CASE(TensorProto_DataType_UINT64, uint64_t)
CASE(TensorProto_DataType_BOOL, bool)
case TensorProto_DataType_STRING:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Casting to and from strings is not supported yet.");
case TensorProto_DataType_UNDEFINED:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Cast op must have 'to' argument of type DataType");
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unexpected 'to' argument value: ", to_);
}
return Status::OK();
}
#define SPECIALIZE_IMPL(T) \
REGISTER_KERNEL_TYPED(T) \
template Status Cast<T>::ComputeInternal(OpKernelContext* context) const;
SPECIALIZE_IMPL(MLFloat16)
SPECIALIZE_IMPL(float)
SPECIALIZE_IMPL(double)
SPECIALIZE_IMPL(int8_t)
SPECIALIZE_IMPL(int16_t)
SPECIALIZE_IMPL(int32_t)
SPECIALIZE_IMPL(int64_t)
SPECIALIZE_IMPL(uint8_t)
SPECIALIZE_IMPL(uint16_t)
SPECIALIZE_IMPL(uint32_t)
SPECIALIZE_IMPL(uint64_t)
SPECIALIZE_IMPL(bool)
SPECIALIZE_IMPL(BFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename SrcT>
class Cast final : public RocmKernel {
public:
Cast(const OpKernelInfo& info) : RocmKernel(info) {
int64_t to;
Status status = info.GetAttr("to", &to);
ORT_ENFORCE(status.IsOK(), "Attribute to is not set.");
to_ = gsl::narrow_cast<ONNX_NAMESPACE::TensorProto_DataType>(to);
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ONNX_NAMESPACE::TensorProto_DataType to_;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "compress.h"
#include "compress_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Compress,
kOnnxDomain,
9, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
Compress);
// explicit negative axis support
ONNX_OPERATOR_KERNEL_EX(
Compress,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
Compress);
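// Behavior example (illustrative, following the ONNX Compress specification):
// for input shape {2, 3}, axis = 1 and condition = {true, false, true}, the output
// shape is {2, 2} and each row keeps its first and third elements; with no axis
// attribute the input is flattened before the condition is applied.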
Status Compress::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input_tensor = ctx->Input<Tensor>(0);
ORT_ENFORCE(input_tensor);
size_t rank = input_tensor->Shape().NumDimensions();
auto input_dimensions = input_tensor->Shape().GetDims();
int64_t axis = 0;
if (has_axis_) {
axis = HandleNegativeAxis(axis_, rank);
}
const Tensor* condition = ctx->Input<Tensor>(1);
ORT_ENFORCE(condition);
auto condition_length = condition->Shape().Size();
auto condition_data = condition->Data<bool>();
// if an axis is given, we need to compress along dimension[axis]; otherwise we compress the flattened input data
int64_t input_size = input_tensor->Shape().Size();
int64_t compress_input_length = has_axis_ ? input_dimensions[axis] : input_size;
int64_t valid_condition_length = compress_input_length < condition_length ? compress_input_length : condition_length;
auto condition_cumulative_sum_buffer = GetScratchBuffer<int32_t>(gsl::narrow<size_t>(valid_condition_length));
auto condition_cumulative_sum = condition_cumulative_sum_buffer.get();
size_t temp_storage_bytes = 0;
HIP_RETURN_IF_ERROR(CompressCalcPrefixSumTempStorageBytes(Stream(),
reinterpret_cast<const int8_t*>(condition_data),
condition_cumulative_sum,
gsl::narrow<int>(valid_condition_length),
temp_storage_bytes));
auto temp_buffer = GetScratchBuffer<uint8_t>(temp_storage_bytes);
auto d_temp_storage = temp_buffer.get();
HIP_RETURN_IF_ERROR(CompressInclusivePrefixSum(Stream(),
d_temp_storage,
temp_storage_bytes,
reinterpret_cast<const int8_t*>(condition_data),
condition_cumulative_sum,
gsl::narrow<int>(valid_condition_length)));
// hipMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
int32_t positive_condition_count = 0;
HIP_RETURN_IF_ERROR(hipMemcpyAsync(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), hipMemcpyDeviceToHost, Stream()));
std::vector<int64_t> output_dims(input_dimensions.begin(), input_dimensions.end());
if (has_axis_) {
output_dims[axis] = positive_condition_count;
} else {
output_dims.resize(1);
output_dims[0] = positive_condition_count;
}
TensorShape output_shape(output_dims);
auto output_tensor = ctx->Output(0, output_shape);
if (positive_condition_count <= 0) {
return Status::OK();
}
auto element_bytes = input_tensor->DataType()->Size();
int64_t axis_right_stride = 1;
if (has_axis_) {
for (auto i = static_cast<size_t>(axis + 1); i < rank; ++i) {
axis_right_stride *= input_dimensions[i];
}
}
ORT_RETURN_IF_ERROR(CompressImpl(Stream(),
element_bytes,
gsl::narrow_cast<int32_t>(valid_condition_length),
gsl::narrow_cast<int32_t>(axis_right_stride),
has_axis_ ? gsl::narrow_cast<int32_t>(input_dimensions[axis])
: gsl::narrow_cast<int32_t>(input_size),
gsl::narrow_cast<int32_t>(positive_condition_count),
condition_cumulative_sum,
condition_data,
input_tensor->DataRaw(),
output_tensor->MutableDataRaw(),
input_size));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class Compress final : public RocmKernel {
public:
Compress(const OpKernelInfo& info) : RocmKernel(info) {
has_axis_ = info.GetAttr("axis", &axis_).IsOK();
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
int64_t axis_;
bool has_axis_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <hipcub/hipcub.hpp>
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
// TODO: fix the warnings
#ifdef _MSC_VER
#pragma warning(disable : 4244)
#endif
#include "core/providers/rocm/tensor/compress_impl.h"
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
namespace onnxruntime {
namespace rocm {
// This cast is for the transform iterator. The iterator's value type affects the accumulator type width
// in InclusiveSum(). By default, the accumulator type matches the input, but for int8_t
// the sum overflows quickly, so we want the source type to match the output (int32_t).
// see https://github.com/NVIDIA/cub/issues/384
struct CastToInt32 : public thrust::unary_function<int8_t, int32_t> {
__host__ __device__ int32_t operator()(int8_t v) const {
return static_cast<int32_t>(v);
}
};
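// Illustrative note (not part of the original source): with, say, 200 `true` entries
// in the condition, an int8_t accumulator would wrap around at 128, while the int32_t
// accumulator selected by this transform iterator yields the correct inclusive sums.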
hipError_t CompressCalcPrefixSumTempStorageBytes(hipStream_t stream, const int8_t* condition_data, int32_t* condition_cumulative_sum, int length, size_t& temp_storage_bytes) {
auto input_iter = thrust::make_transform_iterator(condition_data, CastToInt32());
return hipcub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes, input_iter, condition_cumulative_sum, length, stream);
}
hipError_t CompressInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, const int8_t* condition_data, int32_t* condition_cumulative_sum, int length) {
auto input_iter = thrust::make_transform_iterator(condition_data, CastToInt32());
return hipcub::DeviceScan::InclusiveSum(
d_temp_storage, temp_storage_bytes, input_iter, condition_cumulative_sum, length, stream);
}
template <typename T>
__global__ void _CompressKernel(const int32_t valid_condition_length,
const fast_divmod axis_right_stride_div,
const fast_divmod input_axis_included_stride_div,
const int32_t output_axis_included_stride,
const int32_t* condition_cumulative_sum,
const bool* condition_data,
const T* input_data,
T* output_data,
const HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  HIP_LONG output_index = 0;
  int div, mod;
  // Split the flat input index into the outer block before the compress axis (div)
  // and the offset within that block (mod).
  input_axis_included_stride_div.divmod(id, div, mod);
  output_index = output_axis_included_stride * div;
  // Split the remaining offset into the index along the compress axis (div) and the
  // offset to the right of the axis (mod).
  axis_right_stride_div.divmod(mod, div, mod);
  if (div < valid_condition_length && condition_data[div]) {
    // The inclusive prefix sum gives the 1-based position of this kept slice in the output.
    output_index += (condition_cumulative_sum[div] - 1) * axis_right_stride_div.d_ + mod;
    output_data[output_index] = input_data[id];
  }
}
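// Worked example with illustrative numbers: for an input of shape [2, 3, 4] compressed along
// axis 1 with condition [1, 0, 1], we have axis_right_stride = 4, input_axis_included_stride = 12,
// output_axis_included_stride = 8 (two slices kept) and condition_cumulative_sum = [1, 1, 2].
// The element at flat input index id = 1*12 + 2*4 + 3 = 23 (outer block 1, axis index 2,
// inner offset 3) is kept because condition[2] is true, and lands at
// output_index = 8*1 + (2 - 1)*4 + 3 = 15, i.e. outer block 1, output axis index 1, inner offset 3.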
Status CompressImpl(hipStream_t stream,
const size_t element_bytes,
const int32_t valid_condition_length,
const int32_t axis_right_stride,
const int32_t input_axis_dim_length,
const int32_t output_axis_dim_length,
const int32_t* condition_cumulative_sum,
const bool* condition_data,
const void* input_data,
void* output_data,
const size_t N) {
  // One thread per input element; round N up to a whole number of blocks.
  int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  fast_divmod axis_right_stride_div(axis_right_stride);
  // Size of one block spanning the compress axis, for the input shape and for the
  // (already compressed) output shape respectively.
  fast_divmod input_axis_included_stride_div(axis_right_stride * input_axis_dim_length);
  int output_axis_included_stride = axis_right_stride * output_axis_dim_length;
  // Compress only moves bytes, so dispatch on the element byte width: all element types
  // of the same size share one kernel instantiation.
  switch (element_bytes) {
case sizeof(int8_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Compress operator");
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
hipError_t CompressCalcPrefixSumTempStorageBytes(hipStream_t stream, const int8_t* condition_data,
int32_t* condition_cumulative_sum, int length, size_t& temp_storage_bytes);
hipError_t CompressInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes,
const int8_t* condition_data, int32_t* condition_cumulative_sum, int length);
Status CompressImpl(hipStream_t stream,
const size_t element_bytes,
const int32_t valid_condition_length,
const int32_t axis_right_stride,
const int32_t input_axis_dim_length,
const int32_t output_axis_dim_length,
const int32_t* condition_cumulative_sum,
const bool* condition_data,
const void* input_data,
void* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/concat.h"
#include "core/providers/rocm/tensor/concat_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Concat,
kOnnxDomain,
4, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Concat);
// Opset 11 explicitly supports negative axis values.
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Concat,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Concat);
ONNX_OPERATOR_KERNEL_EX(Concat,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Concat);
Status Concat::ComputeInternal(OpKernelContext* ctx) const {
auto input_count = Node().InputArgCount().front();
// Hold pointers to the input tensors to be used in the PrepareForCompute() step
InlinedTensorsVector input_tensors;
input_tensors.reserve(input_count);
for (int i = 0; i < input_count; ++i) {
input_tensors.push_back(ctx->Input<Tensor>(i));
}
Prepare p;
ORT_RETURN_IF_ERROR(PrepareForCompute(ctx, input_tensors, p));
// Return at this point if output tensor is going to be empty
if (p.output_num_elements == 0)
return Status::OK();
std::vector<int64_t> concat_sizes;
concat_sizes.reserve(input_count);
RocmAsyncBuffer<const void*> input_ptr(this, input_count);
gsl::span<const void*> input_ptr_cpuspan = input_ptr.CpuSpan();
  // For each index along the concat axis of the output, record which input it comes from.
  std::vector<int64_t> axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]);
  int index = 0;
for (int i = 0; i < input_count; ++i) {
const auto& input = p.inputs[i];
concat_sizes.push_back(input.tensor->Shape()[p.axis]);
input_ptr_cpuspan[i] = input.tensor->DataRaw();
for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) {
axis_dimension_input_output_mapping.at(index++) = i;
}
}
auto element_bytes = p.output_tensor->DataType()->Size();
int block_size_inside_axis_dim = static_cast<int>(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]);
int block_size_including_axis_dim = static_cast<int>(p.output_axis_pitch);
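  // Illustrative example with assumed shapes: if the output has shape [2, 5, 4] and the concat
  // axis is 1, and output_axis_pitch is the product of the output dims from the axis onward
  // (5*4 = 20), then block_size_including_axis_dim = 20 and block_size_inside_axis_dim = 4,
  // so each input contributes (its own axis length)*4 contiguous elements per outer row.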
  if (std::all_of(concat_sizes.begin(), concat_sizes.end(), [&](int64_t size) { return size == concat_sizes[0]; })) {
    // All inputs share the same size along the concat axis, so the uniform-size kernel applies.
    // For a small number of inputs, pass the pointers by kernel argument in a TArray and skip
    // the extra host-to-device copy.
    if (input_count <= 32) {
      TArray<const void*, 32> input_ptr_array(input_count);
      for (int i = 0; i < input_count; ++i) input_ptr_array[i] = input_ptr_cpuspan[i];
ORT_RETURN_IF_ERROR(ConcatSameConcatDimImpl(
Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes[0],
p.output_tensor->MutableDataRaw(), input_ptr_array, static_cast<size_t>(p.output_num_elements)));
} else {
ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(ConcatSameConcatDimImpl(
Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes[0],
p.output_tensor->MutableDataRaw(), input_ptr.GpuPtr(), static_cast<size_t>(p.output_num_elements)));
}
  } else {
    // Inputs differ along the concat axis: upload the per-input sizes, their inclusive prefix
    // sums (cumulative extent along the output axis), and the axis-to-input mapping so the
    // kernel can locate the source input for every output element.
    RocmAsyncBuffer<int64_t> concat_sizes_gpu(this, concat_sizes);
    RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, axis_dimension_input_output_mapping);
    std::vector<int64_t> concat_sizes_range(concat_sizes);
    for (size_t i = 1; i < concat_sizes_range.size(); ++i) {
      concat_sizes_range[i] += concat_sizes_range[i - 1];
    }
RocmAsyncBuffer<int64_t> concat_sizes_range_gpu(this, concat_sizes_range);
ORT_RETURN_IF_ERROR(concat_sizes_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(concat_sizes_range_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim,
concat_sizes_gpu.GpuPtr(), concat_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(), p.output_tensor->MutableDataRaw(),
input_ptr.GpuPtr(), static_cast<size_t>(p.output_num_elements)));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime