lite

25d7fde8 · gaoqiong · 8439d29f · 25d7fde8 · 25d7fde8 · 25d7fde8
Commit 25d7fde8 authored Jul 25, 2023 by gaoqiong
20 changed files
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign_impl.cu
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign_impl.cu
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign_impl.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/object_detection/roialign_impl.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cc
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cc
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu
@@ -440,7 +440,7 @@ Status call_reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* outpu
    HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut), stream));
  }
-  constexpr int max_num_threads_in_block = 512;
+  constexpr int max_num_threads_in_block = 256;
  constexpr int max_num_blocks_in_grid = 512;
  constexpr int load_count_per_thread = 4;

--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_ops.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_ops.h
@@ -61,6 +61,8 @@ class ReduceKernel : public RocmKernel, public ReduceKernelBase<allow_multi_axes
  template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
  Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
  template <typename T, typename OutT, miopenReduceTensorIndices_t ReduceTensorIndices>
  Status ReduceKernelShared(
      const T* X,

--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.cc
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.cc
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.cc
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.cc
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_provider_factory_creator.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_provider_factory_creator.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/accumulation_type.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/accumulation_type.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/integer_gemm.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/integer_gemm.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/rocm_utils.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/rocm_utils.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.cc
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.cc
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.h
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.h
--- a/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress.cc
+++ b/build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress.cc