gaoqiong / onnxruntime_v14 / Commits / 1a91fcc2
Commit 1a91fcc2 authored Jul 25, 2023 by gaoqiong

add files required by dtk

parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes: 280
Pipelines: 1
Showing 20 changed files with 1657 additions and 0 deletions
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl.h  +100 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh  +40 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.cc  +127 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.h  +35 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.cu  +51 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.h  +15 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.cc  +144 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.h  +47 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.cu  +168 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.h  +24 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/gemm.h  +36 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul.h  +33 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cc  +114 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cu  +132 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cuh  +25 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.h  +34 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax.h  +69 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_blockwise_impl.cuh  +335 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_common.h  +22 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/topk.cc  +106 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
// These macros simplify coding. To add a new op, follow these steps:
// 1. Add a new entry in BINARY_OPS() list
// 2. (optional) Define templated single element operator in binary_elementwise_ops_impl.cu
// 3. (optional) Implement specialized single element operator
// 4. Add op kernel class definition in binary_elementwise_ops.h
// 5. Add op kernel registration and compute specialization in binary_elementwise_ops.cc
#define BINARY_OPS() \
BINARY_OP_NAME_EXPR(Add, (a + b)) \
BINARY_OP_NAME_EXPR(Sub, (a - b)) \
BINARY_OP_NAME_EXPR(Mul, (a * b)) \
BINARY_OP_NAME_EXPR(Div, (a / b)) \
BINARY_OP_NAME_EXPR(Pow_7, _Pow(a, b)) \
BINARY_OP_NAME_EXPR(And, (a & b)) \
BINARY_OP_NAME_EXPR(Or, (a | b)) \
BINARY_OP_NAME_EXPR(Xor, (a ^ b)) \
BINARY_OP_NAME_EXPR(PRelu, (a > (T)0 ? a : a * b)) \
BINARY_OP_NAME_EXPR(Max, _Max(a, b)) \
BINARY_OP_NAME_EXPR(Min, _Min(a, b)) \
BINARY_OP_NAME_EXPR(Mod, _Mod(a, b)) \
BINARY_OP_NAME_EXPR(Fmod, _Fmod(a, b))
// NOTE that cu files are compiled with nvcc and should not refer to any onnxruntime headers
// so struct BinaryElementwisePreparation cannot be used here
#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
#define BINARY_OP_NAME_EXPR(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION(name);
BINARY_OPS()
#undef BINARY_OP_NAME_EXPR
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \
template <typename T, typename T1> \
void ImplT1_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T1* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow);
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \
template <typename T, typename T1, typename T2> \
void ImplT2_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T1* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T2* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
#define BINARY_OPS2() \
BINARY_OP_NAME_EXPR2(Greater, (a > b)) \
BINARY_OP_NAME_EXPR2(Equal, (a == b)) \
BINARY_OP_NAME_EXPR2(Less, (a < b)) \
BINARY_OP_NAME_EXPR2(GreaterOrEqual, (a >= b)) \
BINARY_OP_NAME_EXPR2(LessOrEqual, (a <= b))
#define BINARY_OP_NAME_EXPR2(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name);
BINARY_OPS2()
#undef BINARY_OP_NAME_EXPR2
}  // namespace rocm
}  // namespace onnxruntime
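Note (not part of the committed file): the comment at the top of this header describes an X-macro pattern — each BINARY_OP_NAME_EXPR entry in BINARY_OPS() becomes one Impl_<name> declaration. As a rough, hand-written sketch of what the preprocessor emits inside this header (where TArray, fast_divmod and hipStream_t are already visible), the Add entry expands to approximately:

// Approximate expansion of BINARY_OP_NAME_EXPR(Add, (a + b)) via
// BINARY_ELEMENTWISE_IMPL_DECLARATION(Add); formatting is illustrative only.
template <typename T>
void Impl_Add(
    hipStream_t stream,
    int32_t output_rank_or_simple_broadcast,
    const TArray<int64_t>* lhs_padded_strides,
    const T* lhs_data,
    const TArray<int64_t>* rhs_padded_strides,
    const T* rhs_data,
    const TArray<fast_divmod>* fdm_output_strides,
    const fast_divmod& fdm_H,
    const fast_divmod& fdm_C,
    T* output_data,
    size_t count);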
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
namespace onnxruntime {
namespace rocm {
// define the device functors that perform the computation on scalars
#define OP_FUNCTOR_DEFINITION(name, expr) \
template <class T, class T1, class T2> \
struct OP_##name { \
__device__ __inline__ T operator()(T1 a, T2 b) const { \
return (expr); \
} \
};
#define BINARY_OP_NAME_EXPR(name, expr) \
OP_FUNCTOR_DEFINITION(name, expr)
BINARY_OPS()
OP_FUNCTOR_DEFINITION(Pow, _Pow(a, b))
#undef BINARY_OP_NAME_EXPR
#define BINARY_OP_NAME_EXPR2(name, expr) \
OP_FUNCTOR_DEFINITION(name, expr)
BINARY_OPS2()
#undef BINARY_OP_NAME_EXPR2
#undef OP_FUNCTOR_DEFINITION
}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/clip.h"
#include "core/providers/rocm/math/clip_impl.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    Clip,
    kOnnxDomain,
    6, 10,
    float,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    Clip_6<float>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Clip,
    kOnnxDomain,
    11, 11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    Clip);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Clip,
    kOnnxDomain,
    12, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>()),
    Clip);

ONNX_OPERATOR_KERNEL_EX(
    Clip,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>()),
    Clip);

template <typename T>
Status Clip_6<T>::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor& X = *ctx->Input<Tensor>(0);
  const TensorShape& input_shape{X.Shape()};
  Tensor* Y = ctx->Output(0, input_shape);
  const size_t count = input_shape.Size();

  if (count > 0) {
    auto* y_data = Y->MutableData<T>();
    const auto* x_data = X.Data<T>();
    ClipImpl<T>(Stream(), x_data, y_data, nullptr, nullptr, this->min_, this->max_, count);
  }

  return Status::OK();
}

namespace clip_internal {
template <typename T>
struct LowMax {
  constexpr static T low() {
    return std::numeric_limits<T>::lowest();
  }
  constexpr static T max() {
    return std::numeric_limits<T>::max();
  }
};

template <>
struct LowMax<MLFloat16> {
  static MLFloat16 low() {
    return MLFloat16(math::floatToHalf(std::numeric_limits<float>::lowest()));
  }
  static MLFloat16 max() {
    return MLFloat16(math::floatToHalf(std::numeric_limits<float>::max()));
  }
};
}  // namespace clip_internal

template <typename T>
struct Clip::ComputeImpl {
  void operator()(hipStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
    auto min_default = clip_internal::LowMax<T>::low();
    auto max_default = clip_internal::LowMax<T>::max();

    const T* min_data = nullptr;
    const T* max_data = nullptr;
    // 1-2 Input on CPU
    if (min) {
      ORT_ENFORCE(min->Shape().IsScalar(), "min should be a scalar.");
      min_data = min->Data<T>();
    }
    if (max) {
      ORT_ENFORCE(max->Shape().IsScalar(), "max should be a scalar.");
      max_data = max->Data<T>();
    }

    const size_t count = X->Shape().Size();
    if (count > 0) {
      auto* y_data = Y->MutableData<T>();
      const auto* x_data = X->Data<T>();
      ClipImpl<T>(stream, x_data, y_data, min_data, max_data, min_default, max_default, count);
    }
  }
};

Status Clip::ComputeInternal(OpKernelContext* ctx) const {
  const auto* X = ctx->Input<Tensor>(0);
  const auto* min = ctx->Input<Tensor>(1);
  const auto* max = ctx->Input<Tensor>(2);
  Tensor* Y = ctx->Output(0, X->Shape());

  utils::MLTypeCallDispatcher<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>
      t_disp(X->GetElementType());
  t_disp.Invoke<ComputeImpl>(Stream(), X, min, max, Y);

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
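Aside (not part of the commit): the element-wise behaviour that Clip::ComputeImpl forwards to ClipImpl can be summarised with a small host-side reference. The helper name below is hypothetical and only illustrates the optional scalar min/max inputs and the numeric-limits defaults supplied by clip_internal::LowMax.

// Hedged CPU reference for the clamp performed by the _Clip device kernel.
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

template <typename T>
std::vector<T> ClipReference(const std::vector<T>& x, const T* min, const T* max) {
  const T lo = min ? *min : std::numeric_limits<T>::lowest();  // default when min input is absent
  const T hi = max ? *max : std::numeric_limits<T>::max();     // default when max input is absent
  std::vector<T> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = std::min(std::max(x[i], lo), hi);  // same clamp as output[id] in the kernel
  }
  return y;
}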
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/math/clip.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Clip_6 final : public onnxruntime::clip_internal::Clip_6Base<T>, public RocmKernel {
 public:
  explicit Clip_6(const OpKernelInfo& info)
      : onnxruntime::clip_internal::Clip_6Base<T>(info), RocmKernel{info} {
  }

  Status ComputeInternal(OpKernelContext* context) const override;
};

// Since version 11. Min and Max are inputs
// version 12 adds type support
class Clip final : public RocmKernel {
 public:
  explicit Clip(const OpKernelInfo& info) : RocmKernel{info} {
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  template <typename T>
  struct ComputeImpl;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.cu
new file mode 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/clip_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

template <typename T>
__global__ void _Clip(const T* input, T* output, const T* min, const T* max, T min_default, T max_default, size_t N) {
  auto min_val = (min) ? *min : min_default;
  auto max_val = (max) ? *max : max_default;
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  output[id] = (input[id] < min_val) ? min_val : ((input[id] > max_val) ? max_val : input[id]);
}

template <typename T>
void ClipImpl(hipStream_t stream, const T* input_data, T* output_data, const T* min, const T* max,
              T min_default, T max_default, size_t count) {
  typedef typename ToHipType<T>::MappedType HipT;
  int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
  union ConstAliasUnion {
    const T* t;
    const HipT* rocmT;
    ConstAliasUnion(const T* _t) { t = _t; }
  };
  union AliasUnion {
    T* t;
    HipT* rocmT;
    AliasUnion(T* _t) { t = _t; }
  };
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_Clip<HipT>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     ((union ConstAliasUnion)input_data).rocmT,
                     ((union AliasUnion)output_data).rocmT,
                     ((union ConstAliasUnion)min).rocmT,
                     ((union ConstAliasUnion)max).rocmT,
                     *((union AliasUnion)&min_default).rocmT,
                     *((union AliasUnion)&max_default).rocmT,
                     count);
}

template void ClipImpl<float>(hipStream_t stream, const float* input_data, float* output_data, const float* min, const float* max, float min_default, float max_default, size_t count);
template void ClipImpl<double>(hipStream_t stream, const double* input_data, double* output_data, const double* min, const double* max, double min_default, double max_default, size_t count);
template void ClipImpl<MLFloat16>(hipStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, const MLFloat16* min, const MLFloat16* max, MLFloat16 min_default, MLFloat16 max_default, size_t count);
template void ClipImpl<int8_t>(hipStream_t stream, const int8_t* input_data, int8_t* output_data, const int8_t* min, const int8_t* max, int8_t min_default, int8_t max_default, size_t count);
template void ClipImpl<uint8_t>(hipStream_t stream, const uint8_t* input_data, uint8_t* output_data, const uint8_t* min, const uint8_t* max, uint8_t min_default, uint8_t max_default, size_t count);
template void ClipImpl<int64_t>(hipStream_t stream, const int64_t* input_data, int64_t* output_data, const int64_t* min, const int64_t* max, int64_t min_default, int64_t max_default, size_t count);
template void ClipImpl<uint64_t>(hipStream_t stream, const uint64_t* input_data, uint64_t* output_data, const uint64_t* min, const uint64_t* max, uint64_t min_default, uint64_t max_default, size_t count);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/math/clip.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
void ClipImpl(hipStream_t stream, const T* input_data, T* output_data, const T* min, const T* max,
              T min_default, T max_default, size_t count);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "cumsum.h"
#include "cumsum_impl.h"
#include "core/providers/cpu/math/cumsum.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    CumSum,
    kOnnxDomain,
    11, 13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)  // 'axis' needs to be on CPU
        .TypeConstraint("T", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>(),
                                                     DataTypeImpl::GetTensorType<uint32_t>(),
                                                     DataTypeImpl::GetTensorType<uint64_t>(),
                                                     DataTypeImpl::GetTensorType<float>(),
                                                     DataTypeImpl::GetTensorType<double>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                      DataTypeImpl::GetTensorType<int64_t>()}),
    CumSum);

ONNX_OPERATOR_KERNEL_EX(
    CumSum,
    kOnnxDomain,
    14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)  // 'axis' needs to be on CPU
        .TypeConstraint("T", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>(),
                                                     DataTypeImpl::GetTensorType<uint32_t>(),
                                                     DataTypeImpl::GetTensorType<uint64_t>(),
                                                     DataTypeImpl::GetTensorType<float>(),
                                                     DataTypeImpl::GetTensorType<double>(),
                                                     DataTypeImpl::GetTensorType<MLFloat16>()})  // MLFloat16 is added in opset 14
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                      DataTypeImpl::GetTensorType<int64_t>()}),
    CumSum);

Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* input = ctx->Input<Tensor>(0);                       // input tensor
  auto rank = static_cast<int64_t>(input->Shape().NumDimensions());  // the rank of the input/output
  if (rank == 0)
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot apply CumSum operator on a scalar");

  const Tensor* axis_tensor = ctx->Input<Tensor>(1);  // axis input tensor
  int64_t axis = 0;
  ORT_THROW_IF_ERROR(cumsum_op::GetAxis(axis_tensor, rank, axis));

  TensorShape output_shape(input->Shape());
  auto& output = *ctx->Output(0, output_shape);  // output tensor

  // output tensor's size is 0, nothing to fill - return
  if (output_shape.Size() == 0)
    return Status::OK();

  const auto& input_dims = input->Shape().GetDims();

  int64_t current_dim = rank - 1;
  int64_t input_stride_along_axis = 1;

  // axis (and by extension current_dim) can never be negative as this is validated much before
  // so no need to add the extra check to make sure current_dim is within bounds of the vector size
  while (current_dim > axis) {
    input_stride_along_axis *= input_dims[current_dim--];
  }

  fast_divmod fast_divmod_input_dim_along_axis(static_cast<int>(input_dims[axis]));
  fast_divmod fast_divmod_input_stride_along_axis(static_cast<int>(input_stride_along_axis));

  if (input->IsDataType<float>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<float>::MappedType*>(input->Data<float>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<float>::MappedType*>(output.MutableData<float>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<double>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<double>::MappedType*>(input->Data<double>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<double>::MappedType*>(output.MutableData<double>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<int32_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<int32_t>::MappedType*>(input->Data<int32_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<int32_t>::MappedType*>(output.MutableData<int32_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<int64_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<int64_t>::MappedType*>(input->Data<int64_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<int64_t>::MappedType*>(output.MutableData<int64_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<uint32_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<uint32_t>::MappedType*>(output.MutableData<uint32_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<uint64_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<uint64_t>::MappedType*>(output.MutableData<uint64_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<MLFloat16>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<MLFloat16>::MappedType*>(output.MutableData<MLFloat16>()),
               output_shape.Size(), exclusive_, reverse_);
  } else {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Unsupported input data type to the CumSum op: ", input->DataType());
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

class CumSum final : public RocmKernel {
 public:
  explicit CumSum(const OpKernelInfo& info) : RocmKernel(info) {
    // Process exclusive attribute
    int64_t exclusive = 0;
    auto status = info.GetAttr("exclusive", &exclusive);
    if (status.IsOK()) {
      if (exclusive == 1 || exclusive == 0) {
        exclusive_ = (exclusive == 1);
      } else {
        ORT_ENFORCE("attribute exclusive can only be 0 or 1");
      }
    }

    // Process reverse attribute
    int64_t reverse = 0;
    status = info.GetAttr("reverse", &reverse);
    if (status.IsOK()) {
      if (reverse == 1 || reverse == 0) {
        reverse_ = (reverse == 1);
      } else {
        ORT_ENFORCE("attribute reverse can only be 0 or 1");
      }
    }
  }

  ~CumSum() = default;

  Status ComputeInternal(OpKernelContext* ctx) const override;

 private:
  bool exclusive_ = false;
  bool reverse_ = false;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.cu
new file mode 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
#include "cumsum_impl.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
__global__ void _CumSumKernel(const T* input_data,
                              const fast_divmod fast_divmod_input_dim_along_axis,
                              const fast_divmod fast_divmod_input_stride_along_axis,
                              T* output_data,
                              const int64_t output_size,
                              const bool exclusive,
                              const bool reverse) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(indices_index, output_size);

  int input_dim_along_axis = fast_divmod_input_dim_along_axis.d_;
  int input_stride_along_axis = fast_divmod_input_stride_along_axis.d_;

  int axis_dim = 0;
  int div = fast_divmod_input_stride_along_axis.div(static_cast<int>(indices_index));
  fast_divmod_input_dim_along_axis.divmod(div, div, axis_dim);

  int start = 0;
  int end = 0;

  if (!reverse && !exclusive) {
    start = 0;
    end = axis_dim;
  } else if (reverse && !exclusive) {
    start = axis_dim;
    end = input_dim_along_axis - 1;
  } else if (!reverse && exclusive) {
    start = 0;
    end = axis_dim - 1;
  } else {  // reverse && exclusive
    start = axis_dim + 1;
    end = input_dim_along_axis - 1;
  }

  // count the number of elements to accumulate the sum
  int count = end - start + 1;

  if (count <= 0) {
    output_data[indices_index] = 0;
    return;
  }

  // adjust start index based on the above identified start dim value along the axis of interest
  int data_index = static_cast<int>(indices_index) + (start - axis_dim) * input_stride_along_axis;

  T sum = 0;

  // keep accumulating values from the start index for 'count' times and skip appropriately
  while (count != 0) {
    sum += input_data[data_index];
    data_index += input_stride_along_axis;
    --count;
  }

  output_data[indices_index] = sum;
}

template <typename T>
void CumSumImpl(hipStream_t stream,
                const T* input_data,
                const fast_divmod& input_dim_along_axis,
                const fast_divmod& input_stride_along_axis,
                T* output_data,
                int64_t output_size,
                bool exclusive,
                bool reverse) {
  if (output_size > 0) {
    int blocksPerGrid = static_cast<int>((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
    hipLaunchKernelGGL(HIP_KERNEL_NAME(_CumSumKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       input_data, input_dim_along_axis, input_stride_along_axis,
                       output_data, output_size, exclusive, reverse);
  }
}

template void CumSumImpl<int32_t>(hipStream_t stream, const int32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, int32_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<int64_t>(hipStream_t stream, const int64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, int64_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<uint32_t>(hipStream_t stream, const uint32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, uint32_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<uint64_t>(hipStream_t stream, const uint64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, uint64_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<float>(hipStream_t stream, const float* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, float* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<double>(hipStream_t stream, const double* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, double* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<half>(hipStream_t stream, const half* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, half* output_data, int64_t output_size, bool exclusive, bool reverse);

}  // namespace rocm
}  // namespace onnxruntime
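Aside (not part of the commit): the start/end selection in _CumSumKernel is easiest to read against a 1-D host reference. The function below is hypothetical and mirrors the four exclusive/reverse cases handled by the kernel above.

// Hedged 1-D CPU reference: for output position i, sum input[start..end],
// where start/end follow the same case analysis as the device kernel.
#include <cstdint>
#include <vector>

std::vector<int64_t> CumSum1D(const std::vector<int64_t>& x, bool exclusive, bool reverse) {
  const int n = static_cast<int>(x.size());
  std::vector<int64_t> y(n, 0);
  for (int i = 0; i < n; ++i) {
    int start, end;
    if (!reverse && !exclusive) {        // inclusive prefix sum
      start = 0; end = i;
    } else if (reverse && !exclusive) {  // inclusive suffix sum
      start = i; end = n - 1;
    } else if (!reverse && exclusive) {  // exclusive prefix sum
      start = 0; end = i - 1;
    } else {                             // exclusive suffix sum
      start = i + 1; end = n - 1;
    }
    for (int j = start; j <= end; ++j) y[i] += x[j];  // empty window leaves y[i] == 0
  }
  return y;
}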
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
void CumSumImpl(hipStream_t stream,
                const T* input_data,
                const fast_divmod& input_dim_along_axis,
                const fast_divmod& input_stride_along_axis,
                T* output_data,
                int64_t output_size,
                bool exclusive,
                bool reverse);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/gemm.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Gemm final : public RocmKernel {
  using Base = RocmKernel;

 public:
  Gemm(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t temp;
    ORT_ENFORCE(info.GetAttr<int64_t>("transA", &temp).IsOK());
    trans_A_ = (temp != 0);

    ORT_ENFORCE(info.GetAttr<int64_t>("transB", &temp).IsOK());
    trans_B_ = (temp != 0);

    ORT_ENFORCE(info.GetAttr<float>("alpha", &alpha_).IsOK());
    ORT_ENFORCE(info.GetAttr<float>("beta", &beta_).IsOK());
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool trans_A_;
  bool trans_B_;
  float alpha_;
  float beta_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class MatMul final : public RocmKernel {
  using Base = RocmKernel;

 public:
  MatMul(const OpKernelInfo& info)
      : RocmKernel(info),
        alpha_{info.GetAttrOrDefault<float>("alpha", 1.0f)},
        trans_A_{info.GetAttrOrDefault<int64_t>("transA", 0) != 0},
        trans_B_{info.GetAttrOrDefault<int64_t>("transB", 0) != 0},
        trans_batch_a_{info.GetAttrOrDefault<int64_t>("transBatchA", 0) != 0},
        trans_batch_b_{info.GetAttrOrDefault<int64_t>("transBatchB", 0) != 0} {}

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  const float alpha_;
  const bool trans_A_;
  const bool trans_B_;
  const bool trans_batch_a_;
  const bool trans_batch_b_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer.h"
#include "matmul_integer.cuh"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/shared_inc/integer_gemm.h"
#include "core/providers/rocm/rocm_allocator.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_TYPED_KERNEL_EX(
    MatMulInteger,
    kOnnxDomain,
    10,
    int8_t,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 2)
        .InputMemoryType(OrtMemTypeCPUInput, 3)
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())
        .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())
        .TypeConstraint("T3", DataTypeImpl::GetTensorType<int32_t>()),
    MatMulInteger<int8_t, int8_t>);

template <>
Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) const {
  auto a = ctx->Input<Tensor>(0);
  auto b = ctx->Input<Tensor>(1);
  ORT_ENFORCE(a != nullptr && b != nullptr);

  MatMulComputeHelper helper;
  ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
  Tensor* Y = ctx->Output(0, helper.OutputShape());

  // Bail out early if the output is going to be empty
  if (Y->Shape().Size() == 0)
    return Status::OK();

  const int8_t* a_ptr = a->Data<int8_t>();
  const int8_t* b_ptr = b->Data<int8_t>();
  int32_t* output_ptr = Y->MutableData<int32_t>();

  // validate zero points
  int8_t a_offset = 0;
  int8_t b_offset = 0;
  if (has_a_zero_point_) {
    auto a_zero_point = ctx->Input<Tensor>(2);
    ORT_ENFORCE(IsScalarOr1ElementVector(a_zero_point),
                "MatmulInteger : input1 zero point must be a scalar or 1D tensor of size 1");
    a_offset = *(a_zero_point->Data<int8_t>());
  }
  if (has_b_zero_point_) {
    auto b_zero_point = ctx->Input<Tensor>(3);
    ORT_ENFORCE(IsScalarOr1ElementVector(b_zero_point),
                "MatmulInteger : input2 zero point must be a scalar or 1D tensor of size 1");
    b_offset = *(b_zero_point->Data<int8_t>());
  }

  // offset output c[i,j] to
  //  k*a_offset*b_offset -
  //  b_offset * (a[i,0] + a[i,1] ...+a[i,k]) -
  //  a_offset * (b[0,j] + b[1,j] ... + b[k,j])
  // ReduceRowSumOnMatrixA computes the b_offset * (a[i,0] + a[i,1] ...+a[i,k]) part
  // ReduceColSumOnMatrixB computes the a_offset * (b[0,j] + b[1,j] ... + b[k,j]) part
  // OffsetOutput computes the final result
  IAllocatorUniquePtr<int32_t> a_row_buf;
  if (b_offset != 0) {
    a_row_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.N());
    ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper));
  }

  IAllocatorUniquePtr<int32_t> b_col_buf;
  if (a_offset != 0) {
    b_col_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.M());
    ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper));
  }

  int alpha = 1;
  int beta = 0;
  if (a_offset != 0 || b_offset != 0) {
    ORT_RETURN_IF_ERROR(OffsetOutput(Stream(),
                                     a_row_buf.get(),
                                     b_col_buf.get(),
                                     output_ptr,
                                     a_offset,
                                     b_offset,
                                     helper));
    beta = 1;
  }

  for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
    ORT_RETURN_IF_ERROR(GemmInt8(static_cast<int>(helper.M()),
                                 static_cast<int>(helper.N()),
                                 static_cast<int>(helper.K()),
                                 alpha, beta,
                                 a_ptr + helper.LeftOffsets()[batch], static_cast<int>(helper.K()),
                                 b_ptr + helper.RightOffsets()[batch], static_cast<int>(helper.N()),
                                 output_ptr + helper.OutputOffsets()[batch], static_cast<int>(helper.N()),
                                 this));
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
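Aside (not part of the commit): the zero-point correction described in the comment block of ComputeInternal follows from expanding (a - a_offset)(b - b_offset). The hypothetical scalar reference below shows the identity that ReduceRowSumOnMatrixA, ReduceColSumOnMatrixB, OffsetOutput and the beta = 1 GemmInt8 call implement together for one output element.

// Hedged scalar reference: sum_k (a[k]-a_offset)*(b[k]-b_offset)
//   = sum_k a*b  -  b_offset*rowsum(a)  -  a_offset*colsum(b)  +  K*a_offset*b_offset
#include <cstdint>
#include <vector>

int32_t QuantizedDot(const std::vector<int8_t>& a_row, const std::vector<int8_t>& b_col,
                     int8_t a_offset, int8_t b_offset) {
  const int32_t K = static_cast<int32_t>(a_row.size());
  int32_t raw = 0, row_sum = 0, col_sum = 0;
  for (int32_t k = 0; k < K; ++k) {
    raw += static_cast<int32_t>(a_row[k]) * b_col[k];  // what the int8 GEMM produces
    row_sum += a_row[k];
    col_sum += b_col[k];
  }
  // the remaining three terms are what OffsetOutput precomputes before the GEMM adds into them
  return raw - b_offset * row_sum - a_offset * col_sum + K * a_offset * b_offset;
}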
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cu
new file mode 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer.cuh"
#include <hipcub/hipcub.hpp>
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

template <int TPB>
__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t offset, int32_t K) {
  int32_t thread_data = 0;
  const int8_t* row_ptr = matrix + blockIdx.x * K;
  for (int i = threadIdx.x; i < K; i += TPB) {
    thread_data += *(row_ptr + i);
  }

  using BlockReduce = hipcub::BlockReduce<int32_t, TPB>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int32_t sum = BlockReduce(temp_storage).Sum(thread_data);

  if (threadIdx.x == 0) {
    row_sum[blockIdx.x] = offset * sum;
  }
}

Status ReduceRowSumOnMatrixA(hipStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
  for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
    ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)>
        <<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
            matrix + helper.LeftOffsets()[batch],
            row_sum + batch * helper.M(),
            offset,
            static_cast<int>(helper.K()));
  }

  return HIP_CALL(hipGetLastError());
}

template <int TPB>
__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t offset, int32_t row, int32_t col) {
  int32_t thread_data = 0;
  const int8_t* col_ptr = matrix + blockIdx.x;
  for (int i = threadIdx.x; i < row; i += TPB) {
    thread_data += *(col_ptr + i * col);
  }

  using BlockReduce = hipcub::BlockReduce<int32_t, TPB>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int32_t sum = BlockReduce(temp_storage).Sum(thread_data);

  if (threadIdx.x == 0) {
    col_sum[blockIdx.x] = offset * sum;
  }
}

Status ReduceColSumOnMatrixB(hipStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
  for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
    ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)>
        <<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(
            matrix + helper.RightOffsets()[batch],
            col_sum + batch * helper.N(),
            offset,
            static_cast<int32_t>(helper.K()),
            static_cast<int32_t>(helper.N()));
  }

  return HIP_CALL(hipGetLastError());
}

__global__ void ComputeOffsetOfMatrixAB(const int32_t* row_sum, const int32_t* col_sum, int32_t* output, int32_t K_A_B, int32_t N) {
  for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
    *(output + blockIdx.x * N + i) = K_A_B - row_sum[blockIdx.x] - col_sum[i];
  }
}

__global__ void ComputeOffsetOfMatrixA(const int32_t* col_sum, int32_t* output, int32_t N) {
  for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
    *(output + blockIdx.x * N + i) = -col_sum[i];
  }
}

__global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum, int32_t* output, int32_t N) {
  for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
    *(output + blockIdx.x * N + i) = -row_sum[blockIdx.x];
  }
}

Status OffsetOutput(hipStream_t stream,
                    const int32_t* row_sum,
                    const int32_t* col_sum,
                    int32_t* output,
                    const int8_t a_offset,
                    const int8_t b_offset,
                    const MatMulComputeHelper& helper) {
  if (a_offset && b_offset) {
    for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
      ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          row_sum + batch * helper.M(),
          col_sum + batch * helper.N(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.K()) * a_offset * b_offset,
          static_cast<int32_t>(helper.N()));
    }
  } else if (a_offset) {
    for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
      ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          col_sum + batch * helper.N(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.N()));
    }
  } else if (b_offset) {
    for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
      ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          row_sum + batch * helper.M(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.N()));
    }
  }

  return HIP_CALL(hipGetLastError());
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cuh
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "matmul_integer.h"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

Status ReduceRowSumOnMatrixA(hipStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);

Status ReduceColSumOnMatrixB(hipStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);

Status OffsetOutput(hipStream_t stream,
                    const int32_t* row_sum,
                    const int32_t* col_sum,
                    int32_t* output,
                    const int8_t a_offset,
                    const int8_t b_offset,
                    const MatMulComputeHelper& helper);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T1, typename T2>
class MatMulInteger final : public RocmKernel {
  using Base = RocmKernel;

 public:
  MatMulInteger(const OpKernelInfo& info) : RocmKernel(info) {
    has_a_zero_point_ = false;
    has_b_zero_point_ = false;
    if (info.GetInputCount() > 2) {
      has_a_zero_point_ = true;
    }
    if (info.GetInputCount() > 3) {
      has_b_zero_point_ = true;
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool has_a_zero_point_;
  bool has_b_zero_point_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/gsl.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T, bool is_log_softmax>
Status SoftMaxComputeHelper(
    hipStream_t stream,
    const T* input,
    const TensorShape& shape,
    T* Y,
    int64_t axis);

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src,
                                       int softmax_elements, int softmax_elements_stride, int batch_count);

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_blockwise_softmax_forward(hipStream_t stream, output_t* output, const input_t* input,
                                        int softmax_elements, int input_stride, int output_stride, int batch_count);

template <typename T>
class Softmax final : public RocmKernel {
 public:
  Softmax(const OpKernelInfo& info) : RocmKernel{info} {
    const auto& node = info.node();
    opset_ = node.SinceVersion();

    int64_t axis;
    Status status = info.GetAttr<int64_t>("axis", &axis);

    if (status.IsOK()) {
      axis_ = gsl::narrow_cast<int>(axis);
    } else {
      if (opset_ < 13) {
        axis_ = 1;  // opset-12 and below, the default axis value is 1
      } else {
        axis_ = -1;  // opset-13, the default axis value is -1
      }
    }

    log_softmax_ = info.GetKernelDef().OpName() == "LogSoftmax";

    // We need to cast away the const as PerThreadRocblasHandle() is currently a non-const method
    // TODO: Clean up the ROCMExecutionProvider interface to avoid this
    rocm_ep_ = const_cast<ROCMExecutionProvider*>(
        static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  int64_t axis_;
  bool log_softmax_;
  int opset_;

  // We need to access to the ROCM EP instance to get the rocblas handle to use
  // for transposing(if applicable)
  ROCMExecutionProvider* rocm_ep_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_blockwise_impl.cuh
new file mode 100644
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The code below is mostly copied from Pytorch SoftMax.cuh
#pragma once
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

constexpr int ALIGN_BYTES = 16;
const int max_threads = 1024;

dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
  uint64_t block_size = 1;
  uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));

  // In the vectorized case we want to trade off allowing more of the buffers to be accessed
  // in a vectorized way against wanting a larger block size to get better utilisation.
  // In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk
  // of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while
  // allowing a larger block size.
  if (ILP > 1) {
    max_block_size /= 2;
  }

  while (block_size < (max_block_size)) block_size *= 2;
  // Launch at least a single warp - the kernel assumes that.
  block_size = std::max(block_size, static_cast<uint64_t>(GPU_WARP_SIZE_HOST));
  return dim3(static_cast<unsigned int>(block_size));
}

////////////////////////////////////////////////////////////////////////////////
// Regular kernel (fast when dim_size is large; requires inner_size == 1)
////////////////////////////////////////////////////////////////////////////////

template <typename T, typename AccumT>
struct MaxFloat {
  __device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
    return ::max(max, (AccumT)v);
  }
};

template <typename T, typename AccumT>
struct AddFloat {
  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
    return sum + (AccumT)v;
  }
};

template <typename T, typename AccumT>
struct SumExpFloat {
  __device__ __forceinline__ SumExpFloat(AccumT v)
      : max_k(v) {}

  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
    return sum + expf((AccumT)v - max_k);
  }

  const AccumT max_k;
};

template <template <typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
            const Reduction<AccumT>& r,
            AccumT defaultVal) {
  // To avoid RaW races from chaining blockReduce calls together, we need a sync here
  __syncthreads();

  smem[threadIdx.x] = val;

  __syncthreads();

  AccumT warpVal = defaultVal;

  // First warp will perform per-warp reductions for the remaining warps
  uint32_t mask = (((uint64_t)1) << (blockDim.x / GPU_WARP_SIZE)) - 1;
  if (threadIdx.x < GPU_WARP_SIZE) {
    int lane = threadIdx.x % GPU_WARP_SIZE;
    if (lane < blockDim.x / GPU_WARP_SIZE) {
#pragma unroll
      for (int i = 0; i < GPU_WARP_SIZE; ++i) {
        warpVal = r(warpVal, smem[lane * GPU_WARP_SIZE + i]);
      }
#if !defined(USE_ROCM)
      __syncwarp(mask);
#endif
      smem[lane] = warpVal;
    }
  }

  __syncthreads();

  // First thread will perform a reduction of the above per-warp reductions
  AccumT blockVal = defaultVal;

  if (threadIdx.x == 0) {
    for (int i = 0; i < blockDim.x / GPU_WARP_SIZE; ++i) {
      blockVal = r(blockVal, smem[i]);
    }
    smem[0] = blockVal;
  }

  // Sync and broadcast
  __syncthreads();
  return smem[0];
}

template <template <typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
ilpReduce(int shift,
          T* data,
          int size,
          const Reduction<T, AccumT>& r,
          AccumT defaultVal) {
  using LoadT = aligned_vector<T, ILP>;
  AccumT threadVal = defaultVal;
  int offset = threadIdx.x;

  // shift and do 1
  if (shift > 0) {
    data -= shift;
    size += shift;
    if (threadIdx.x >= shift) {
      threadVal = r(threadVal, data[offset]);
    }
    size -= blockDim.x;
    data += blockDim.x;
  }
  int last = size % (ILP * blockDim.x);

  T v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);

  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(data)[offset];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      threadVal = r(threadVal, v[j]);
    }
  }

  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x)
    threadVal = r(threadVal, data[offset]);

  return threadVal;
}

/**
 * This will apply the Epilogue with vectorized reads & writes when input & output have the same shift
 */
template <int ILP, typename scalar_t, typename accum_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__device__ __forceinline__ void
WriteFpropResultsVectorized(int size,
                            const int shift,
                            scalar_t* input,
                            outscalar_t* output,
                            Epilogue<scalar_t, accum_t, outscalar_t> epilogue) {
  using LoadT = aligned_vector<scalar_t, ILP>;
  using StoreT = aligned_vector<outscalar_t, ILP>;

  int offset = threadIdx.x;

  // if unaligned, do one value / thread and move on, guaranteeing aligned reads/writes later
  if (shift > 0) {
    input -= shift;
    output -= shift;
    size += shift;

    if (threadIdx.x >= shift) {
      output[offset] = epilogue(input[offset]);
    }
    size -= blockDim.x;
    input += blockDim.x;
    output += blockDim.x;
  }

  const int last = size % (ILP * blockDim.x);

  scalar_t in_v[ILP];
  LoadT* in_value = reinterpret_cast<LoadT*>(&in_v);

  outscalar_t out_v[ILP];
  StoreT* out_value = reinterpret_cast<StoreT*>(&out_v);

  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *in_value = reinterpret_cast<LoadT*>(input)[offset];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      out_v[j] = epilogue(in_v[j]);
    }

    reinterpret_cast<StoreT*>(output)[offset] = *out_value;
  }

  offset = size - last + threadIdx.x;
  // handle the tail
  for (; offset < size; offset += blockDim.x) {
    output[offset] = epilogue(input[offset]);
  }
}

/**
 * This will apply the Epilogue with non-vectrorized reads & writes for the general case
 */
template <int ILP, typename scalar_t, typename accum_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__device__ __forceinline__ void
WriteFpropResults(int classes,
                  scalar_t* input,
                  outscalar_t* output,
                  Epilogue<scalar_t, accum_t, outscalar_t> epilogue) {
  int offset = threadIdx.x;

  int last = classes % (ILP * blockDim.x);

  // Main bulk of loop with ILP
  for (; offset < classes - last; offset += blockDim.x * ILP) {
    scalar_t tmp[ILP];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      tmp[j] = input[offset + j * blockDim.x];
    }
#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      output[offset + j * blockDim.x] = epilogue(tmp[j]);
    }
  }

  // Remainder - no ILP
  for (; offset < classes; offset += blockDim.x) {
    output[offset] = epilogue(input[offset]);
  }
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__global__ void softmax_block_forward(outscalar_t* output, scalar_t* input, int classes,
                                      int input_stride, int output_stride) {
  extern __shared__ unsigned char smem[];
  auto sdata = reinterpret_cast<accscalar_t*>(smem);

  using LoadT = aligned_vector<scalar_t, ILP>;
  using StoreT = aligned_vector<outscalar_t, ILP>;

  // forward pointers to batch[blockIdx.x]
  // each block handles a sample in the mini-batch
  input += blockIdx.x * input_stride;
  output += blockIdx.x * output_stride;

  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
  const int output_shift = ((uint64_t)output) % ALIGN_BYTES / sizeof(outscalar_t);

  // find the max
  accscalar_t threadMax = ilpReduce<MaxFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, MaxFloat<scalar_t, accscalar_t>(), -std::numeric_limits<accscalar_t>::max());
  accscalar_t max_k = blockReduce<Max, accscalar_t>(
      sdata, threadMax, Max<accscalar_t>(), -std::numeric_limits<accscalar_t>::max());

  // reduce all values
  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
  accscalar_t sumAll = blockReduce<Add, accscalar_t>(
      sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));

  Epilogue<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);

  if (shift == output_shift) {
    WriteFpropResultsVectorized<ILP, scalar_t, accscalar_t, outscalar_t, Epilogue>(classes, shift, input, output, epilogue);
  } else {
    WriteFpropResults<ILP, scalar_t, accscalar_t, outscalar_t, Epilogue>(classes, input, output, epilogue);
  }
}

template <typename T, typename AccumT, typename OutT>
struct LogSoftMaxForwardEpilogue {
  __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
      : max_input(max_input), logsum(logf(sum)) {}

  __device__ __forceinline__ OutT operator()(T input) const {
    return static_cast<OutT>((AccumT)input - max_input - logsum);
  }

  const AccumT max_input;
  const AccumT logsum;
};

template <typename T, typename AccumT, typename OutT>
struct SoftMaxForwardEpilogue {
  __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
      : max_input(max_input), sum(sum) {}

  __device__ __forceinline__ OutT operator()(T input) const {
    return static_cast<OutT>(expf((AccumT)input - max_input) / sum);
  }

  const AccumT max_input;
  const AccumT sum;
};

}
}
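Aside (not part of the commit): per sample, the blockwise kernel computes a max-subtracted exp-sum and then applies one of the two epilogues above. The hypothetical single-row host reference below summarises that numerically stable formulation.

// Hedged single-row CPU reference for the blockwise softmax / log-softmax forward pass.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

std::vector<float> SoftmaxRowReference(const std::vector<float>& x, bool is_log_softmax) {
  float max_k = -std::numeric_limits<float>::max();
  for (float v : x) max_k = std::max(max_k, v);  // MaxFloat + blockReduce<Max>
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - max_k);  // SumExpFloat + blockReduce<Add>
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = is_log_softmax ? (x[i] - max_k - std::log(sum))  // LogSoftMaxForwardEpilogue
                          : std::exp(x[i] - max_k) / sum;   // SoftMaxForwardEpilogue
  }
  return y;
}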
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_common.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/status.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {

Status SoftmaxForward(miopenHandle_t miopen_handle, const void* alpha, const miopenTensorDescriptor_t input_tensor,
                      const void* input_data, const void* beta, const miopenTensorDescriptor_t output_tensor,
                      void* output_data);

Status SoftmaxBackward(miopenHandle_t miopen_handle, bool is_log_softmax, const void* alpha,
                       const miopenTensorDescriptor_t input_tensor, const void* output_data,
                       const void* output_grad_data, const void* beta,
                       const miopenTensorDescriptor_t output_tensor, void* input_grad_data);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/topk.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "topk.h"
#include "topk_impl.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    TopK,
    kOnnxDomain,
    1, 9,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    TopK<false>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    TopK,
    kOnnxDomain,
    10, 10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
    TopK<true>);

ONNX_OPERATOR_KERNEL_EX(
    TopK,
    kOnnxDomain,
    11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
    TopK<true>);

template <bool inputk>
TopK<inputk>::TopK(const OpKernelInfo& info) : RocmKernel(info) {
  info.GetAttrOrDefault<int64_t>("axis", &axis_, -1);
  info.GetAttrOrDefault<int64_t>("largest", &largest_, 1);
  info.GetAttrOrDefault<int64_t>("sorted", &sorted_, 1);
  if (!inputk) {
    info.GetAttrOrDefault<int64_t>("k", &K_, 0);
  }
}

#define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType<T>(prim_type)
#define TOPKIMPL(T) TopKImpl<T>(this, stream, tensor_X->Data<T>(),                 \
                                static_cast<T*>(tensor_V->MutableDataRaw()),       \
                                static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
                                elem_nums_rocm,                                    \
                                elem_nums.size(),                                  \
                                axis, K_, largest_, sorted_, N, dimension)

template <bool inputk>
Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
  auto tensor_X = ctx->Input<Tensor>(0);
  ORT_ENFORCE(nullptr != tensor_X);
  int32_t rank = static_cast<int32_t>(tensor_X->Shape().NumDimensions());
  int32_t axis = static_cast<int32_t>(axis_ < 0 ? rank + axis_ : axis_);
  ORT_ENFORCE(axis > -1 && axis < rank);

  if (inputk) {
    auto tensor_K = ctx->Input<Tensor>(1);
    ORT_ENFORCE(nullptr != tensor_K);
    K_ = *tensor_K->Data<int64_t>();
    ORT_ENFORCE(K_ >= 0 && K_ <= tensor_X->Shape().GetDims()[axis]);
  }

  auto output_shape = tensor_X->Shape();
  output_shape[axis] = K_;
  auto tensor_V = ctx->Output(0, output_shape);
  auto tensor_I = ctx->Output(1, output_shape);

  if (0 == K_) {
    return Status::OK();
  }

  auto elem_nums = tensor_X->Shape().AsShapeVector();
  auto dimension = elem_nums[axis];
  for (auto i = static_cast<int64_t>(elem_nums.size()) - 2; i >= 0; --i) {
    elem_nums[i] *= elem_nums[i + 1];
  }

  auto N = elem_nums[0] / dimension;
  TArray<int64_t> elem_nums_rocm(elem_nums);

  auto prim_type = tensor_X->DataType()->AsPrimitiveDataType();
  if (prim_type == nullptr) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
  }

  hipStream_t stream = this->Stream();
  if (IS_PRIM_TYPE(uint8_t)) return TOPKIMPL(uint8_t);
  if (IS_PRIM_TYPE(uint16_t)) return TOPKIMPL(uint16_t);
  if (IS_PRIM_TYPE(uint32_t)) return TOPKIMPL(uint32_t);
  if (IS_PRIM_TYPE(uint64_t)) return TOPKIMPL(uint64_t);
  if (IS_PRIM_TYPE(int8_t)) return TOPKIMPL(int8_t);
  if (IS_PRIM_TYPE(int16_t)) return TOPKIMPL(int16_t);
  if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t);
  if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t);
  if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16);
  if (IS_PRIM_TYPE(float)) return TOPKIMPL(float);
  if (IS_PRIM_TYPE(double)) return TOPKIMPL(double);

  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
}

}  // namespace rocm
}  // namespace onnxruntime