Commit 25d7fde8 authored by gaoqiong

lite

parent 8439d29f
......@@ -9,6 +9,7 @@ or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
"""
__version__ = "1.14.0"
__author__ = "Microsoft"
__dcu_version__ = "1.14.0+git556e6af.abi0.dtk2304"
# we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
# in order to know whether the onnxruntime package is for training it needs
......
......@@ -191,7 +191,7 @@ class BatchNorm : public OpKernel {
EigenArrayMap<T> Y_arr(Y->MutableData<T>(),
is_spatial_ ? sample_size : sample_size_incl_all_channels,
is_spatial_ ? N * C : N);
if (is_spatial_) { // spatial == 1
for (size_t nc = 0; nc < N * C; ++nc) {
Y_arr.col(nc) = X_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
......
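The spatial branch above applies a precomputed per-channel affine transform, Y = X * new_scale(c) + new_bias(c). Below is a minimal scalar sketch of how that pair is typically folded from the saved statistics, consistent with the HIP kernel added later in this commit; the helper name and default epsilon are illustrative, not taken from the file.
// Minimal sketch: fold BatchNorm inference into a per-channel scale/bias pair,
// matching y = (x - mean) / sqrt(var + eps) * gamma + beta.
#include <cmath>
#include <vector>
struct FoldedBN { std::vector<float> scale, bias; };
// Hypothetical helper; names and eps default are illustrative.
FoldedBN FoldBatchNorm(const std::vector<float>& gamma, const std::vector<float>& beta,
                       const std::vector<float>& mean, const std::vector<float>& var,
                       float eps = 1e-5f) {
  FoldedBN f{std::vector<float>(gamma.size()), std::vector<float>(gamma.size())};
  for (size_t c = 0; c < gamma.size(); ++c) {
    f.scale[c] = gamma[c] / std::sqrt(var[c] + eps);  // new_scale(c)
    f.bias[c]  = beta[c] - mean[c] * f.scale[c];      // new_bias(c)
  }
  return f;
}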
......@@ -73,6 +73,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
const size_t kernel_rank = kernel_shape.size();
BufferUniquePtr col_buffer;
//printf("***********<typename T>***********");
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
......@@ -126,6 +127,13 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
}
}
// std::cout<<"col_buffer_data:"<<std::endl;
// for(int i=0;i<200;++i)
// {
// printf("%f ",col_buffer_data[i]);
// }
// printf("\n");
math::Gemm<T>(
CblasNoTrans,
CblasNoTrans,
......@@ -163,6 +171,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
const int64_t C = X->Shape()[1];
const int64_t M = W->Shape()[0];
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
//printf("##############float###############");
// kernel_shape is an optional attribute and has to be inferred from W if not provided
TensorShapeVector kernel_shape;
......
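The CPU convolution above is lowered to im2col followed by a GEMM, and, as the comment in the hunk notes, a pointwise kernel lets the original input stand in for the column buffer. Here is a rough sketch of the shape bookkeeping under NCHW, single-group assumptions; the helper is illustrative, not code from the file.
// Rough shape bookkeeping for im2col + GEMM (NCHW, group = 1; illustrative only).
#include <cstdint>
struct Im2ColGemmDims {
  int64_t gemm_m;  // output channels M
  int64_t gemm_k;  // C * kH * kW (rows of the col buffer)
  int64_t gemm_n;  // outH * outW (columns of the col buffer)
};
Im2ColGemmDims ComputeDims(int64_t C, int64_t M, int64_t kH, int64_t kW,
                           int64_t outH, int64_t outW) {
  // col buffer is (C*kH*kW) x (outH*outW); Y = W[M x C*kH*kW] * col.
  return {M, C * kH * kW, outH * outW};
}
For a 1x1 kernel gemm_k collapses to C, so the column buffer would simply reproduce the input, which is why the temporary buffer can be skipped in that case.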
......@@ -21,6 +21,7 @@
#include "core/common/safeint.h"
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
#include <iostream>
namespace onnxruntime {
......
......@@ -28,6 +28,14 @@ struct PoolAttributes {
const std::string& op_name, int start_version)
: global_pooling(IsGlobalPooling(op_name)) {
if (global_pooling) {
if(op_name == "GlobalAveragePool") {
global_average_pooling=true;
}
if(op_name == "GlobalMaxPool")
{
global_max_pooling=true;
}
return;
}
......@@ -62,11 +70,13 @@ struct PoolAttributes {
if (op_name == "AveragePool") {
int64_t temp;
average_pooling=true;
ORT_ENFORCE(info.GetAttr<int64_t>("count_include_pad", &temp).IsOK());
count_include_pad = (temp != 0);
}
if (op_name == "MaxPool") {
max_pooling= true;
if (start_version >= 8) {
ORT_ENFORCE(info.GetAttr("storage_order", &storage_order).IsOK());
}
......@@ -84,6 +94,10 @@ struct PoolAttributes {
}
const bool global_pooling;
bool max_pooling=false;
bool global_max_pooling=false;
bool average_pooling=false;
bool global_average_pooling=false;
bool count_include_pad{};
int64_t storage_order{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
......
......@@ -191,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl(
#ifdef USE_ROCM
const int num_elements_per_thread = 2;
const int num_threads_per_block = 512;
const int num_threads_per_block = 256;
#else
const int num_elements_per_thread = GridDim::maxElementsPerThread;
const int num_threads_per_block = GridDim::maxThreadsPerBlock;
......@@ -227,7 +227,7 @@ void BinaryElementWiseImpl(
#ifdef USE_ROCM
const int num_elements_per_thread = 2;
const int num_threads_per_block = 512;
const int num_threads_per_block = 256;
#else
const int num_elements_per_thread = GridDim::maxElementsPerThread;
const int num_threads_per_block = GridDim::maxThreadsPerBlock;
......
......@@ -10,7 +10,7 @@ namespace cuda {
#ifdef USE_ROCM
constexpr int kElementsPerThread = 2;
constexpr int kThreadsPerBlock = 512;
constexpr int kThreadsPerBlock = 256;
#else
constexpr int kElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -12,7 +12,7 @@ namespace cuda {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
constexpr int kNumThreadsPerBlock = 256;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -11,7 +11,7 @@ namespace cuda {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
constexpr int kNumThreadsPerBlock = 256;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -12,7 +12,7 @@ namespace cuda {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
constexpr int kNumThreadsPerBlock = 256;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -9,7 +9,7 @@ namespace cuda {
#ifdef USE_ROCM
constexpr int num_elements_per_thread = 2;
constexpr int num_threads_per_block = 512;
constexpr int num_threads_per_block = 256;
#else
constexpr int num_elements_per_thread = GridDim::maxElementsPerThread;
constexpr int num_threads_per_block = GridDim::maxThreadsPerBlock;
......
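Each of the ROCm hunks above halves the thread-block size from 512 to 256 while keeping two elements per thread. A minimal sketch of how those two constants usually translate into a launch grid follows; the helper is illustrative and is not the launch code in these files.
// Illustrative grid-size calculation from the two tuning constants above.
#include <cstdint>
constexpr int kElementsPerThread = 2;
constexpr int kThreadsPerBlock = 256;  // was 512 before this commit
inline int BlocksForN(int64_t n) {
  const int64_t per_block = static_cast<int64_t>(kThreadsPerBlock) * kElementsPerThread;
  return static_cast<int>((n + per_block - 1) / per_block);  // ceil(n / per_block)
}
The bn_sugon.cu launch below uses the same ceil-divide pattern with its BLOCK constant.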
#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "ort_sugon.cuh"
#include <math.h>
#include "bn_sugon.cuh"
__global__ void batch_normal_kernel(int n,const float *im, const float *scale,const float *bias, const float *mean,
const float *var, float *output,const int batch,const int channels,const int height,const int width,const int index4,const int index5)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int j_index5=id % index5;
id /= index5;
int j_index4=id % index4;
id /= index4;
int j = id % width; // column index
id /= width;
int i = id % height;
id /= height;
int k = id % channels;
id /= channels;
int b = id;
float epsilon=0.00001;
int input_index=j_index5+index5*(j_index4+index4*(j+width*(i+height*(k+b*channels)))); // hsqrt supports half-precision square root; see __float2half / __half2float
output[input_index]=(im[input_index]-mean[k])/sqrt(var[k]+epsilon) *scale[k]+bias[k];
}
__global__ void batch_normal_kernel(int n,const __half *im, const __half *scale,const __half *bias, const __half *mean,
const __half *var, __half *output,const int batch,const int channels,const int height,const int width,const int index4,const int index5)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int j_index5=id % index5;
id /= index5;
int j_index4=id % index4;
id /= index4;
int j = id % width; // column index
id /= width;
int i = id % height;
id /= height;
int k = id % channels;
id /= channels;
int b = id;
// We can fuse the output computation as follows:
// ((x - est_mean) * (inv_var) * scale + bias
// to
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
int input_index=j_index5+index5*(j_index4+index4*(j+width*(i+height*(k+b*channels)))); // hsqrt supports half-precision square root; see __float2half / __half2float
const float val1 = var[k];
const float scale1 = scale[k];
const float mean1 = mean[k];
const float input = im[input_index];
const float bias1 = bias[k];
const float epsilon = 0.00001;
const float new_scale = scale1 /sqrt(val1+epsilon);
const float new_bias = bias1 - mean1 * new_scale;
const float tmp = input*new_scale+new_bias ;
//output[input_index]=(im[input_index]-mean[k])/hsqrt(var[k]+epsilon) *scale[k]+bias[k];
output[input_index]=__float2half(tmp);
}
template <typename T>
void batch_normal(hipStream_t stream, const T *im, const T *scale,const T *bias, const T *mean,const T *var, T *output,const int batch,
const int channels,const int height,const int width,const int index4,const int index5){
int num_kernels=channels*batch*height*width*index4*index5;
batch_normal_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,scale,bias,mean,var,output,batch,channels,height,width,index4,index5);
}
#define INSTANTIATEBATCH_NORMAL(T) \
template void batch_normal(hipStream_t stream, const T *im, const T *scale,const T *bias, const T *mean,const T *var, T *output,const int batch, \
const int channels,const int height,const int width,const int index4,const int index5);
INSTANTIATEBATCH_NORMAL(float)
INSTANTIATEBATCH_NORMAL(half)
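The half-precision kernel above folds the normalization into a per-channel new_scale/new_bias pair, as its in-code comment describes. A small host-side check of that algebra in plain C++, illustrative only:
// Check that (x - mean)/sqrt(var+eps)*scale + bias equals x*new_scale + new_bias.
#include <cassert>
#include <cmath>
int main() {
  const float x = 0.7f, mean = 0.2f, var = 0.09f, scale = 1.5f, bias = -0.3f;
  const float eps = 1e-5f;
  const float direct = (x - mean) / std::sqrt(var + eps) * scale + bias;
  const float new_scale = scale / std::sqrt(var + eps);
  const float new_bias = bias - mean * new_scale;
  const float fused = x * new_scale + new_bias;
  assert(std::fabs(direct - fused) < 1e-5f);
  return 0;
}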
#ifndef BN_SUGON_H
#define BN_SUGON_H
#pragma once
template <typename T>
void batch_normal(hipStream_t stream, const T *im, const T *scale,const T *bias, const T *mean,const T *var, T *output,const int batch,
const int channels,const int height,const int width,const int index4,const int index5);
#endif
\ No newline at end of file
......@@ -4,9 +4,21 @@
#include "core/providers/rocm/nn/conv.h"
#include "core/common/span_utils.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/math/gemm.h"
#include "core/providers/cpu/math/gemm_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/tunable/gemm.h"
#include "core/providers/rocm/tensor/slice.h"
#include "core/providers/rocm/nn/im2col.cuh"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <iostream>
using namespace std;
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"
namespace onnxruntime {
namespace rocm {
......@@ -87,6 +99,173 @@ Status SliceOutUnwantedOutputSection(hipStream_t stream,
return SliceRocm::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size);
}
template <typename T>
Status Conv<T>::conv_im2col(OpKernelContext* context, bool bias_expected) const {
//set X
const Tensor* X = context->Input<Tensor>(0);
const Tensor* W = context->Input<Tensor>(1);
const Tensor* B = context->InputCount() >= 3 ? context->Input<Tensor>(2) : nullptr;
const Tensor* Sum = context->InputCount() >= 4 ? context->Input<Tensor>(3) : nullptr;
const int64_t N = X->Shape()[0];
const int64_t C = X->Shape()[1];
const int64_t M = W->Shape()[0];
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
// kernel_shape is an optional attribute and has to be inferred from W if not provided
TensorShapeVector kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
ConvAttributes::ConvPadVector pads(conv_attrs_.pads);
if (pads.empty()) {
pads.resize(kernel_shape.size() * 2, 0);
}
TensorShapeVector dilations(conv_attrs_.dilations);
if (dilations.empty()) {
dilations.resize(kernel_shape.size(), 1);
}
TensorShapeVector strides(conv_attrs_.strides);
if (strides.empty()) {
strides.resize(kernel_shape.size(), 1);
}
TensorShapeVector Y_dims({N, M});
TensorShape input_shape = X->Shape().Slice(2);
ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
Tensor* Y = context->Output(0, TensorShape(Y_dims));
TensorShape output_shape = Y->Shape().Slice(2);
// Bail out early if one of the dimensions is zero.
if (Y->Shape().Size() == 0) {
return Status::OK();
}
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
const auto* Xdata=reinterpret_cast<const HipT*>(X->Data<T>());
//const auto* Xdata = X->Data<float>();
//const auto* Bdata = B != nullptr ? B->Data<float>() : nullptr;
const auto* Wdata = reinterpret_cast<const HipT*>(W->Data<T>());
const auto* Bdata = (B != nullptr) ?reinterpret_cast<const HipT*>(B->Data<T>()): nullptr;
//auto* Ydata = Y->MutableData<float>();
auto* Ydata = reinterpret_cast<HipT*> (Y->MutableData<T>());
// Check for the optional Conv/Sum fusion.
//float Beta = 0.0f;
if (Sum != nullptr) {
const auto& sum_shape = Sum->Shape();
ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match");
// If the output was not allocated inplace with the sum tensor, then copy here.
const auto* sum_data = reinterpret_cast<const HipT*>(Sum->Data<T>());
if (Ydata != sum_data) {
hipMemcpy(Ydata, sum_data, SafeInt<size_t>(sum_shape.Size()) * sizeof(HipT),hipMemcpyDeviceToDevice);
}
// Beta = 1.0f;
}
const size_t kernel_rank = kernel_shape.size();
const int64_t input_image_size = input_shape.Size();
const int64_t output_image_size = output_shape.Size();
const int64_t kernel_size = TensorShape(kernel_shape).Size();
const int64_t X_offset = C / conv_attrs_.group * input_image_size;
const int64_t Y_offset = Y->Shape().Size() / Y->Shape()[0] / conv_attrs_.group;
const int64_t W_offset = W->Shape().Size() / conv_attrs_.group;
const int64_t kernel_dim = C / conv_attrs_.group * kernel_size;
const int64_t single_col_buffer_size = kernel_dim * output_image_size;
// temporary variables needed for the im2col expansion
const int64_t col_buffer_size = kernel_dim *conv_attrs_.group *output_image_size;
const int64_t im2col_X_offset = C * input_image_size;
auto* col_data = alloc->Alloc(sizeof(HipT) * SafeInt<size_t>(col_buffer_size));
BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc)));
auto* col_buffer_data =reinterpret_cast <HipT*>(col_buffer.get());//static_cast
const HipT zero = ToHipType<T>::FromFloat(0.f);
const float alpha = 1.0f;
//const float beta = 0.0f;
if(kernel_rank==2||kernel_rank==1){
if (Bdata!=nullptr)
{
assign_bias_gpu<HipT>(Stream(),Ydata, Bdata, N, M, output_image_size);
}
else
{
//hipMemset(Ydata,zero,output_image_size); //将ydata初始化为0
assign_val_gpu<HipT>(Stream(),Ydata, zero,N,M, output_image_size);
}
for (int image_id = 0; image_id < N; ++image_id) {
auto *temp_b = col_buffer_data;
auto *im_src = reinterpret_cast<const HipT*>(Xdata + (image_id)*im2col_X_offset); //X
if(kernel_rank==2)
im2col_gpu<HipT>(Stream(),im_src, C ,input_shape[0],input_shape[1],kernel_shape[0],kernel_shape[1],strides[0],strides[1],pads[0],pads[1],pads[2],pads[3],dilations[0],dilations[1],temp_b);
else if(kernel_rank==1)
im2col_gpu<HipT>(Stream(),im_src, C ,1,input_shape[0],1,kernel_shape[0],1,strides[0],0,pads[0],0,pads[1],1,dilations[0],temp_b); // the last 0 is the padding value
auto *a = Wdata ; //W
auto *b = col_buffer_data;
auto *c = Ydata + (image_id*conv_attrs_.group)*Y_offset;
const int stride_A = M/conv_attrs_.group*kernel_dim;
const int stride_B = output_image_size*kernel_dim;
const int stride_C = M/conv_attrs_.group*output_image_size;
ROCBLAS_RETURN_IF_ERROR(rocblasGemmStridedBatchedHelper(
RocblasHandle(),
rocblas_operation_none,
rocblas_operation_none,
static_cast<int>(output_image_size),static_cast<int>(M/conv_attrs_.group), static_cast<int>(kernel_dim),
&alpha,
b, static_cast<int>(output_image_size),stride_B,//x
a, static_cast<int>(kernel_dim),stride_A, //w
&alpha,
c, static_cast<int>(output_image_size),stride_C,
static_cast<int>(conv_attrs_.group)));
// for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
// auto *a = W->Data<float>() + (group_id) * W_offset; //W
// auto *b = col_buffer_data+(group_id)*single_col_buffer_size;
// //auto *im = Xdata + (image_id*conv_attrs_.group+group_id)*X_offset; //X
// auto *c = Ydata + (image_id*conv_attrs_.group+group_id)*Y_offset;
// const float alpha = 1.0;
// const float beta = 0.0;
// ROCBLAS_RETURN_IF_ERROR(rocblasGemmHelper(
// RocblasHandle(),
// rocblas_operation_none,
// rocblas_operation_none,
// output_image_size,M/conv_attrs_.group, kernel_dim,
// &alpha,
// b, output_image_size,
// a, kernel_dim,
// &beta,
// c, output_image_size));
// }
}
// if (Bdata!=nullptr)
// {
// //void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
// add_bias_gpu<HipT>(Stream(),Ydata, Bdata, static_cast<int>(N), static_cast<int>(M), static_cast<int>(output_image_size));
// }
}
return Status::OK();
}
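conv_im2col pre-fills Y with the bias (or zeros) and then passes alpha for both GEMM scaling factors, so the strided-batched GEMM accumulates W x col on top of it, with the convolution groups mapped onto the batch dimension. Below is a rough per-group size/stride plan mirroring the values computed above; the helper itself is illustrative, not code from the file.
// Per-group size/stride bookkeeping for the strided-batched GEMM above (illustrative).
#include <cstdint>
struct GroupGemm {
  int64_t m, n, k;                       // per-group GEMM sizes
  int64_t stride_a, stride_b, stride_c;  // strides between groups
};
GroupGemm PlanGroupedConvGemm(int64_t M, int64_t C, int64_t group,
                              int64_t kernel_size, int64_t output_image_size) {
  const int64_t kernel_dim = C / group * kernel_size;
  GroupGemm g;
  g.m = output_image_size;                      // columns of the col buffer
  g.n = M / group;                              // output channels per group
  g.k = kernel_dim;
  g.stride_a = M / group * kernel_dim;          // weight slice per group
  g.stride_b = output_image_size * kernel_dim;  // col-buffer slice per group
  g.stride_c = M / group * output_image_size;   // output slice per group
  return g;
}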
template <typename T>
Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const {
//set X
......@@ -260,6 +439,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
}
const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
s_.fwd_algo = perf.fwd_algo;
s_.workspace_bytes = perf.memory;
} else {
//set Y
......@@ -280,7 +460,24 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
template <typename T>
Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
std::lock_guard<OrtMutex> lock(s_.mutex);
// Check whether this is a 1-D or 2-D convolution; if so, compute it with im2col + GEMM
const Tensor* W = context->Input<Tensor>(1);
TensorShapeVector kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
const size_t kernel_rank = kernel_shape.size();
if(kernel_rank==2||kernel_rank==1)
{
//std::cout<<"conv compute with im2col+gemm"<<std::endl;
ORT_RETURN_IF_ERROR(conv_im2col(context));
return Status::OK();
}
// Otherwise compute the convolution through MIOpen
//std::cout<<"conv compute with miopen"<<std::endl;
ORT_RETURN_IF_ERROR(UpdateState(context));
if (s_.Y->Shape().Size() == 0) {
return Status::OK();
}
......
......@@ -191,6 +191,7 @@ class Conv : public RocmKernel {
}
Status UpdateState(OpKernelContext* context, bool bias_expected = false) const;
Status conv_im2col(OpKernelContext* context, bool bias_expected = false) const;
ConvAttributes conv_attrs_;
mutable MiopenConvState<miopenConvAlgoPerf_t> s_;
constexpr static auto kDefaultConvAlgo = miopenConvolutionFwdAlgoGEMM;
......
......@@ -2,6 +2,13 @@
// Licensed under the MIT License.
#include "conv_transpose.h"
#include "core/providers/rocm/nn/im2col.cuh"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <iostream>
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"
namespace onnxruntime {
namespace rocm {
......@@ -36,6 +43,116 @@ Status ConvTranspose<T>::ComputeInternal(OpKernelContext* context) const {
return DoConvTranspose(context, false);
}
template <typename T>
Status ConvTranspose<T>::ConvTranspose_col2im(OpKernelContext* context, bool dynamic_padding) const{
typedef typename ToHipType<T>::MappedType HipT;
size_t num_inputs = OpKernel::Node().InputDefs().size();
ConvTransposeAttributes::Prepare p;
bool has_bias = dynamic_padding ? num_inputs == 4 : num_inputs == 3;
ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(
context, has_bias, p, dynamic_padding, transposed_filter_ ? &filter_shape_ : nullptr));
// Bail out early if one of the dimensions is zero.
if (p.Y->Shape().Size() == 0) {
return Status::OK();
}
const int64_t input_image_size = p.input_shape.Size();
const int64_t X_offset = p.num_input_channels / conv_transpose_attrs_.group * input_image_size;
const int64_t Y_offset = p.Y->Shape().Size() / p.Y->Shape()[0] / conv_transpose_attrs_.group;
const int64_t W_offset = p.F->Shape().Size() / conv_transpose_attrs_.group;
const int64_t kernel_size = TensorShape(p.kernel_shape).Size();
const int64_t kernel_dim = p.num_output_channels / conv_transpose_attrs_.group * kernel_size;
const int64_t output_size = (p.Y->Shape().Slice(2)).Size();
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
const int64_t col_buffer_size = kernel_dim * p.input_shape.Size();//*conv_transpose_attrs_.group ;
auto col_data = alloc->Alloc(SafeInt<size_t>(sizeof(float)) * col_buffer_size);
BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc)));
auto* col_buffer_data = reinterpret_cast<float*>(col_buffer.get());
const auto* Xdata =reinterpret_cast<const HipT*>(p.X->Data<T>());
const auto* filter_data = p.F ? reinterpret_cast<const HipT*>(p.F->Data<T>()) : reinterpret_cast< HipT*>(transposed_filter_.get());
auto* Ydata = reinterpret_cast< HipT*>(p.Y->MutableData<T>());
TensorShape output_shape = p.Y->Shape().Slice(2);
const HipT zero = ToHipType<T>::FromFloat(0.f);
const HipT one = ToHipType<T>::FromFloat(1.f);
const float alpha = 1.0f;
const float beta = 0.0f;
if (has_bias) {
const auto* Bdata =reinterpret_cast<const HipT*>(p.B->Data<T>());
const auto& b_shape = p.B->Shape();
ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D");
assign_bias_gpu<HipT>(Stream(),Ydata, Bdata, p.N, p.num_output_channels, output_size);
}
else{
assign_val_gpu<HipT>(Stream(),Ydata, zero, p.N, p.num_output_channels, output_size);
}
for (auto image_id = 0; image_id < p.N; ++image_id) {
// auto* a= filter_data ; // [C_in/G, C_out/G, k_h, k_w] flattened to [C_in/G, C_out/G*k_h*k_w]
// auto* b= Xdata + (image_id*conv_transpose_attrs_.group) * X_offset; // [1, C_in/G, H, W] flattened to [C_in/G, H*W]
// auto* y= Ydata + (image_id*conv_transpose_attrs_.group) * Y_offset;
// int stride_A = p.num_input_channels / conv_transpose_attrs_.group*kernel_dim;
// int stride_B = input_image_size*kernel_dim;
// int stride_C = p.num_input_channels / conv_transpose_attrs_.group*input_image_size;
// ROCBLAS_RETURN_IF_ERROR(rocblasGemmStridedBatchedHelper(
// RocblasHandle(),
// rocblas_operation_none,
// p.F ? rocblas_operation_transpose:rocblas_operation_none,//rocblas_operation_transpose
// static_cast<int>(input_image_size),static_cast<int>(kernel_dim),static_cast<int>(p.num_input_channels / conv_transpose_attrs_.group),
// &alpha,
// b, static_cast<int>(input_image_size),stride_B, //x
// a, static_cast<int>(kernel_dim),stride_A, //w
// &beta,
// col_buffer_data, static_cast<int>(input_image_size),stride_C,conv_transpose_attrs_.group));
for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
//gemm
auto* a= filter_data + group_id * W_offset; // [C_in/G, C_out/G, k_h, k_w] flattened to [C_in/G, C_out/G*k_h*k_w]
auto* b= Xdata + (image_id*conv_transpose_attrs_.group+group_id) * X_offset; // [1, C_in/G, H, W] flattened to [C_in/G, H*W]
auto* y= Ydata + (image_id*conv_transpose_attrs_.group+group_id) * Y_offset;
ROCBLAS_RETURN_IF_ERROR(rocblasGemmHelper(
RocblasHandle(),
rocblas_operation_none,
p.F ? rocblas_operation_transpose:rocblas_operation_none,//rocblas_operation_transpose
static_cast<int>(input_image_size),static_cast<int>(kernel_dim),static_cast<int>(p.num_input_channels / conv_transpose_attrs_.group),
&one,
b, static_cast<int>(input_image_size), //x
a, static_cast<int>(kernel_dim), //w
&zero,
col_buffer_data, static_cast<int>(input_image_size)));
if (p.X->Shape().NumDimensions() == 4){
col2im_gpu<HipT>(Stream(),col_buffer_data,
p.num_output_channels / conv_transpose_attrs_.group, p.Y->Shape()[2], p.Y->Shape()[3],
p.kernel_shape[0],p.kernel_shape[1], p.strides[0],p.strides[1],
p.pads[0],p.pads[1],p.pads[2],p.pads[3],
p.dilations[0],p.dilations[1], y);
}
else if(p.X->Shape().NumDimensions() == 3)
{
col2im_gpu<HipT>(Stream(),col_buffer_data,
p.num_output_channels / conv_transpose_attrs_.group, 1, p.Y->Shape()[2],
1,p.kernel_shape[0],1, p.strides[0],0,p.pads[0],0,p.pads[1], 1,p.dilations[0], y);
}
}
}
return Status::OK();
}
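ConvTranspose_col2im runs the lowering in reverse: per group, a GEMM against the (optionally transposed) filter produces a column buffer of kernel_dim by input_image_size, and col2im scatter-adds it into an output that was pre-filled with the bias. A minimal sketch of the per-group sizes, mirroring the arguments passed to rocblasGemmHelper above; the helper is illustrative only.
// Per-group sizes for the ConvTranspose GEMM + col2im path (illustrative).
#include <cstdint>
struct ConvTransposePlan {
  int64_t gemm_m;           // input_image_size (inH * inW)
  int64_t gemm_n;           // kernel_dim = C_out/group * kH * kW
  int64_t gemm_k;           // C_in / group
  int64_t col_buffer_size;  // gemm_m * gemm_n elements per group
};
ConvTransposePlan PlanConvTranspose(int64_t c_in, int64_t c_out, int64_t group,
                                    int64_t kH, int64_t kW, int64_t inH, int64_t inW) {
  ConvTransposePlan p;
  p.gemm_m = inH * inW;
  p.gemm_n = c_out / group * kH * kW;
  p.gemm_k = c_in / group;
  p.col_buffer_size = p.gemm_m * p.gemm_n;
  return p;
}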
template <typename T>
Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
typedef typename ToHipType<T>::MappedType HipT;
......@@ -43,14 +160,23 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& x_shape = X->Shape();
auto x_dims = x_shape.AsShapeVector();
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto x_dimensions = X->Shape().NumDimensions();
if (x_dimensions < 3 || x_dimensions > 5) {
// TODO: the error message should tell which operator raises it.
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input X must be 3-, 4- or 5-dimensional.",
" X: ", X->Shape().ToString().c_str());
}
if(x_dimensions==4||x_dimensions==3)
{
ORT_RETURN_IF_ERROR(ConvTranspose_col2im(context,dynamic_padding));
return Status::OK();
}
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
const Tensor* W = context->Input<Tensor>(1);
const TensorShape& w_shape = W->Shape();
auto w_dims = w_shape.AsShapeVector();
......@@ -181,8 +307,8 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
miopenConvolutionBackwardData(
MiopenHandle(),
&alpha,
s_.x_tensor,
x_data,
s_.x_tensor,
x_data,
s_.w_desc,
w_data,
s_.conv_desc,
......