Commit 25d7fde8 authored by gaoqiong

lite

parent 8439d29f
......@@ -9,6 +9,7 @@ or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
"""
__version__ = "1.14.0"
__author__ = "Microsoft"
__dcu_version__ = "1.14.0+git556e6af.abi0.dtk2304"
# we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
# in order to know whether the onnxruntime package is for training it needs
......
......@@ -191,7 +191,7 @@ class BatchNorm : public OpKernel {
EigenArrayMap<T> Y_arr(Y->MutableData<T>(),
is_spatial_ ? sample_size : sample_size_incl_all_channels,
is_spatial_ ? N * C : N);
if (is_spatial_) { // spatial == 1
for (size_t nc = 0; nc < N * C; ++nc) {
Y_arr.col(nc) = X_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
......
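The spatial branch above applies a precomputed per-channel affine transform, Y = X * new_scale(c) + new_bias(c). Below is a minimal scalar sketch of how that pair is typically folded from the saved statistics, consistent with the HIP kernel added later in this commit; the helper name and default epsilon are illustrative, not taken from the file.
// Minimal sketch: fold BatchNorm inference into a per-channel scale/bias pair,
// matching y = (x - mean) / sqrt(var + eps) * gamma + beta.
#include <cmath>
#include <vector>
struct FoldedBN { std::vector<float> scale, bias; };
// Hypothetical helper; names and eps default are illustrative.
FoldedBN FoldBatchNorm(const std::vector<float>& gamma, const std::vector<float>& beta,
                       const std::vector<float>& mean, const std::vector<float>& var,
                       float eps = 1e-5f) {
  FoldedBN f{std::vector<float>(gamma.size()), std::vector<float>(gamma.size())};
  for (size_t c = 0; c < gamma.size(); ++c) {
    f.scale[c] = gamma[c] / std::sqrt(var[c] + eps);  // new_scale(c)
    f.bias[c]  = beta[c] - mean[c] * f.scale[c];      // new_bias(c)
  }
  return f;
}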
......@@ -73,6 +73,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
const size_t kernel_rank = kernel_shape.size();
BufferUniquePtr col_buffer;
//printf("***********<typename T>***********");
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
......@@ -126,6 +127,13 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
}
}
// std::cout<<"col_buffer_data:"<<std::endl;
// for(int i=0;i<200;++i)
// {
// printf("%f ",col_buffer_data[i]);
// }
// printf("\n");
math::Gemm<T>(
CblasNoTrans,
CblasNoTrans,
......@@ -163,6 +171,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
const int64_t C = X->Shape()[1];
const int64_t M = W->Shape()[0];
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
//printf("##############float###############");
// kernel_shape is an optional attribute and has to be inferred from W if not provided
TensorShapeVector kernel_shape;
......
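The CPU convolution above is lowered to im2col followed by a GEMM, and, as the comment in the hunk notes, a pointwise kernel lets the original input stand in for the column buffer. Here is a rough sketch of the shape bookkeeping under NCHW, single-group assumptions; the helper is illustrative, not code from the file.
// Rough shape bookkeeping for im2col + GEMM (NCHW, group = 1; illustrative only).
#include <cstdint>
struct Im2ColGemmDims {
  int64_t gemm_m;  // output channels M
  int64_t gemm_k;  // C * kH * kW (rows of the col buffer)
  int64_t gemm_n;  // outH * outW (columns of the col buffer)
};
Im2ColGemmDims ComputeDims(int64_t C, int64_t M, int64_t kH, int64_t kW,
                           int64_t outH, int64_t outW) {
  // col buffer is (C*kH*kW) x (outH*outW); Y = W[M x C*kH*kW] * col.
  return {M, C * kH * kW, outH * outW};
}
For a 1x1 kernel gemm_k collapses to C, so the column buffer would simply reproduce the input, which is why the temporary buffer can be skipped in that case.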
......@@ -21,6 +21,7 @@
#include "core/common/safeint.h"
#include "core/util/math.h"
#include "core/util/math_cpuonly.h"
#include <iostream>
namespace onnxruntime {
......
......@@ -28,6 +28,14 @@ struct PoolAttributes {
const std::string& op_name, int start_version)
: global_pooling(IsGlobalPooling(op_name)) {
if (global_pooling) {
if(op_name == "GlobalAveragePool") {
global_average_pooling=true;
}
if(op_name == "GlobalMaxPool")
{
global_max_pooling=true;
}
return;
}
......@@ -62,11 +70,13 @@ struct PoolAttributes {
if (op_name == "AveragePool") {
int64_t temp;
average_pooling=true;
ORT_ENFORCE(info.GetAttr<int64_t>("count_include_pad", &temp).IsOK());
count_include_pad = (temp != 0);
}
if (op_name == "MaxPool") {
max_pooling= true;
if (start_version >= 8) {
ORT_ENFORCE(info.GetAttr("storage_order", &storage_order).IsOK());
}
......@@ -84,6 +94,10 @@ struct PoolAttributes {
}
const bool global_pooling;
bool max_pooling=false;
bool global_max_pooling=false;
bool average_pooling=false;
bool global_average_pooling=false;
bool count_include_pad{};
int64_t storage_order{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
......
......@@ -191,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl(
#ifdef USE_ROCM
const int num_elements_per_thread = 2;
const int num_threads_per_block = 512;
const int num_threads_per_block = 256;
#else
const int num_elements_per_thread = GridDim::maxElementsPerThread;
const int num_threads_per_block = GridDim::maxThreadsPerBlock;
......@@ -227,7 +227,7 @@ void BinaryElementWiseImpl(
#ifdef USE_ROCM
const int num_elements_per_thread = 2;
const int num_threads_per_block = 512;
const int num_threads_per_block = 256;
#else
const int num_elements_per_thread = GridDim::maxElementsPerThread;
const int num_threads_per_block = GridDim::maxThreadsPerBlock;
......
......@@ -10,7 +10,7 @@ namespace cuda {
#ifdef USE_ROCM
constexpr int kElementsPerThread = 2;
constexpr int kThreadsPerBlock = 512;
constexpr int kThreadsPerBlock = 256;
#else
constexpr int kElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -12,7 +12,7 @@ namespace cuda {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
constexpr int kNumThreadsPerBlock = 256;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -11,7 +11,7 @@ namespace cuda {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
constexpr int kNumThreadsPerBlock = 256;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -12,7 +12,7 @@ namespace cuda {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
constexpr int kNumThreadsPerBlock = 256;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
......
......@@ -9,7 +9,7 @@ namespace cuda {
#ifdef USE_ROCM
constexpr int num_elements_per_thread = 2;
constexpr int num_threads_per_block = 512;
constexpr int num_threads_per_block = 256;
#else
constexpr int num_elements_per_thread = GridDim::maxElementsPerThread;
constexpr int num_threads_per_block = GridDim::maxThreadsPerBlock;
......
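Each of the ROCm hunks above halves the thread-block size from 512 to 256 while keeping two elements per thread. A minimal sketch of how those two constants usually translate into a launch grid follows; the helper is illustrative and is not the launch code in these files.
// Illustrative grid-size calculation from the two tuning constants above.
#include <cstdint>
constexpr int kElementsPerThread = 2;
constexpr int kThreadsPerBlock = 256;  // was 512 before this commit
inline int BlocksForN(int64_t n) {
  const int64_t per_block = static_cast<int64_t>(kThreadsPerBlock) * kElementsPerThread;
  return static_cast<int>((n + per_block - 1) / per_block);  // ceil(n / per_block)
}
The bn_sugon.cu launch below uses the same ceil-divide pattern with its BLOCK constant.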
#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "ort_sugon.cuh"
#include <math.h>
#include "bn_sugon.cuh"
__global__ void batch_normal_kernel(int n,const float *im, const float *scale,const float *bias, const float *mean,
const float *var, float *output,const int batch,const int channels,const int height,const int width,const int index4,const int index5)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int j_index5=id % index5;
id /= index5;
int j_index4=id % index4;
id /= index4;
int j = id % width; // column index
id /= width;
int i = id % height;
id /= height;
int k = id % channels;
id /= channels;
int b = id;
float epsilon=0.00001;
int input_index=j_index5+index5*(j_index4+index4*(j+width*(i+height*(k+b*channels)))); // hsqrt supports half-precision square root; see __float2half / __half2float
output[input_index]=(im[input_index]-mean[k])/sqrt(var[k]+epsilon) *scale[k]+bias[k];
}
__global__ void batch_normal_kernel(int n,const __half *im, const __half *scale,const __half *bias, const __half *mean,
const __half *var, __half *output,const int batch,const int channels,const int height,const int width,const int index4,const int index5)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int j_index5=id % index5;
id /= index5;
int j_index4=id % index4;
id /= index4;
int j = id % width; // column index
id /= width;
int i = id % height;
id /= height;
int k = id % channels;
id /= channels;
int b = id;
// We can fuse the output computation as follows:
// ((x - est_mean) * (inv_var) * scale + bias
// to
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
int input_index=j_index5+index5*(j_index4+index4*(j+width*(i+height*(k+b*channels)))); // hsqrt supports half-precision square root; see __float2half / __half2float
const float val1 = var[k];
const float scale1 = scale[k];
const float mean1 = mean[k];
const float input = im[input_index];
const float bias1 = bias[k];
const float epsilon = 0.00001;
const float new_scale = scale1 /sqrt(val1+epsilon);
const float new_bias = bias1 - mean1 * new_scale;
const float tmp = input*new_scale+new_bias ;
//output[input_index]=(im[input_index]-mean[k])/hsqrt(var[k]+epsilon) *scale[k]+bias[k];
output[input_index]=__float2half(tmp);
}
template <typename T>
void batch_normal(hipStream_t stream, const T *im, const T *scale,const T *bias, const T *mean,const T *var, T *output,const int batch,
const int channels,const int height,const int width,const int index4,const int index5){
int num_kernels=channels*batch*height*width*index4*index5;
batch_normal_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,scale,bias,mean,var,output,batch,channels,height,width,index4,index5);
}
#define INSTANTIATEBATCH_NORMAL(T) \
template void batch_normal(hipStream_t stream, const T *im, const T *scale,const T *bias, const T *mean,const T *var, T *output,const int batch, \
const int channels,const int height,const int width,const int index4,const int index5);
INSTANTIATEBATCH_NORMAL(float)
INSTANTIATEBATCH_NORMAL(half)
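The half-precision kernel above folds the normalization into a per-channel new_scale/new_bias pair, as its in-code comment describes. A small host-side check of that algebra in plain C++, illustrative only:
// Check that (x - mean)/sqrt(var+eps)*scale + bias equals x*new_scale + new_bias.
#include <cassert>
#include <cmath>
int main() {
  const float x = 0.7f, mean = 0.2f, var = 0.09f, scale = 1.5f, bias = -0.3f;
  const float eps = 1e-5f;
  const float direct = (x - mean) / std::sqrt(var + eps) * scale + bias;
  const float new_scale = scale / std::sqrt(var + eps);
  const float new_bias = bias - mean * new_scale;
  const float fused = x * new_scale + new_bias;
  assert(std::fabs(direct - fused) < 1e-5f);
  return 0;
}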
#ifndef BN_SUGON_H
#define BN_SUGON_H
#pragma once
template <typename T>
void batch_normal(hipStream_t stream, const T *im, const T *scale,const T *bias, const T *mean,const T *var, T *output,const int batch,
const int channels,const int height,const int width,const int index4,const int index5);
#endif
\ No newline at end of file
......@@ -4,9 +4,21 @@
#include "core/providers/rocm/nn/conv.h"
#include "core/common/span_utils.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/math/gemm.h"
#include "core/providers/cpu/math/gemm_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/tunable/gemm.h"
#include "core/providers/rocm/tensor/slice.h"
#include "core/providers/rocm/nn/im2col.cuh"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <iostream>
using namespace std;
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"
namespace onnxruntime {
namespace rocm {
......@@ -87,6 +99,173 @@ Status SliceOutUnwantedOutputSection(hipStream_t stream,
return SliceRocm::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size);
}
template <typename T>
Status Conv<T>::conv_im2col(OpKernelContext* context, bool bias_expected) const {
//set X
const Tensor* X = context->Input<Tensor>(0);
const Tensor* W = context->Input<Tensor>(1);
const Tensor* B = context->InputCount() >= 3 ? context->Input<Tensor>(2) : nullptr;
const Tensor* Sum = context->InputCount() >= 4 ? context->Input<Tensor>(3) : nullptr;
const int64_t N = X->Shape()[0];
const int64_t C = X->Shape()[1];
const int64_t M = W->Shape()[0];
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
// kernel_shape is an optional attribute and has to be inferred from W if not provided
TensorShapeVector kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
ConvAttributes::ConvPadVector pads(conv_attrs_.pads);
if (pads.empty()) {
pads.resize(kernel_shape.size() * 2, 0);
}
TensorShapeVector dilations(conv_attrs_.dilations);
if (dilations.empty()) {
dilations.resize(kernel_shape.size(), 1);
}
TensorShapeVector strides(conv_attrs_.strides);
if (strides.empty()) {
strides.resize(kernel_shape.size(), 1);
}
TensorShapeVector Y_dims({N, M});
TensorShape input_shape = X->Shape().Slice(2);
ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
Tensor* Y = context->Output(0, TensorShape(Y_dims));
TensorShape output_shape = Y->Shape().Slice(2);
// Bail out early if one of the dimensions is zero.
if (Y->Shape().Size() == 0) {
return Status::OK();
}
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
const auto* Xdata=reinterpret_cast<const HipT*>(X->Data<T>());
//const auto* Xdata = X->Data<float>();
//const auto* Bdata = B != nullptr ? B->Data<float>() : nullptr;
const auto* Wdata = reinterpret_cast<const HipT*>(W->Data<T>());
const auto* Bdata = (B != nullptr) ?reinterpret_cast<const HipT*>(B->Data<T>()): nullptr;
//auto* Ydata = Y->MutableData<float>();
auto* Ydata = reinterpret_cast<HipT*> (Y->MutableData<T>());
// Check for the optional Conv/Sum fusion.
//float Beta = 0.0f;
if (Sum != nullptr) {
const auto& sum_shape = Sum->Shape();
ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match");
// If the output was not allocated inplace with the sum tensor, then copy here.
const auto* sum_data = reinterpret_cast<const HipT*>(Sum->Data<T>());
if (Ydata != sum_data) {
hipMemcpy(Ydata, sum_data, SafeInt<size_t>(sum_shape.Size()) * sizeof(HipT),hipMemcpyDeviceToDevice);
}
// Beta = 1.0f;
}
const size_t kernel_rank = kernel_shape.size();
const int64_t input_image_size = input_shape.Size();
const int64_t output_image_size = output_shape.Size();
const int64_t kernel_size = TensorShape(kernel_shape).Size();
const int64_t X_offset = C / conv_attrs_.group * input_image_size;
const int64_t Y_offset = Y->Shape().Size() / Y->Shape()[0] / conv_attrs_.group;
const int64_t W_offset = W->Shape().Size() / conv_attrs_.group;
const int64_t kernel_dim = C / conv_attrs_.group * kernel_size;
const int64_t single_col_buffer_size = kernel_dim * output_image_size;
// temporary variables needed for the im2col expansion
const int64_t col_buffer_size = kernel_dim *conv_attrs_.group *output_image_size;
const int64_t im2col_X_offset = C * input_image_size;
auto* col_data = alloc->Alloc(sizeof(HipT) * SafeInt<size_t>(col_buffer_size));
BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc)));
auto* col_buffer_data =reinterpret_cast <HipT*>(col_buffer.get());//static_cast
const HipT zero = ToHipType<T>::FromFloat(0.f);
const float alpha = 1.0f;
//const float beta = 0.0f;
if(kernel_rank==2||kernel_rank==1){
if (Bdata!=nullptr)
{
assign_bias_gpu<HipT>(Stream(),Ydata, Bdata, N, M, output_image_size);
}
else
{
//hipMemset(Ydata,zero,output_image_size); //将ydata初始化为0
assign_val_gpu<HipT>(Stream(),Ydata, zero,N,M, output_image_size);
}
for (int image_id = 0; image_id < N; ++image_id) {
auto *temp_b = col_buffer_data;
auto *im_src = reinterpret_cast<const HipT*>(Xdata + (image_id)*im2col_X_offset); //X
if(kernel_rank==2)
im2col_gpu<HipT>(Stream(),im_src, C ,input_shape[0],input_shape[1],kernel_shape[0],kernel_shape[1],strides[0],strides[1],pads[0],pads[1],pads[2],pads[3],dilations[0],dilations[1],temp_b);
else if(kernel_rank==1)
im2col_gpu<HipT>(Stream(),im_src, C ,1,input_shape[0],1,kernel_shape[0],1,strides[0],0,pads[0],0,pads[1],1,dilations[0],temp_b); // the last 0 is the padding value
auto *a = Wdata ; //W
auto *b = col_buffer_data;
auto *c = Ydata + (image_id*conv_attrs_.group)*Y_offset;
const int stride_A = M/conv_attrs_.group*kernel_dim;
const int stride_B = output_image_size*kernel_dim;
const int stride_C = M/conv_attrs_.group*output_image_size;
ROCBLAS_RETURN_IF_ERROR(rocblasGemmStridedBatchedHelper(
RocblasHandle(),
rocblas_operation_none,
rocblas_operation_none,
static_cast<int>(output_image_size),static_cast<int>(M/conv_attrs_.group), static_cast<int>(kernel_dim),
&alpha,
b, static_cast<int>(output_image_size),stride_B,//x
a, static_cast<int>(kernel_dim),stride_A, //w
&alpha,
c, static_cast<int>(output_image_size),stride_C,
static_cast<int>(conv_attrs_.group)));
// for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
// auto *a = W->Data<float>() + (group_id) * W_offset; //W
// auto *b = col_buffer_data+(group_id)*single_col_buffer_size;
// //auto *im = Xdata + (image_id*conv_attrs_.group+group_id)*X_offset; //X
// auto *c = Ydata + (image_id*conv_attrs_.group+group_id)*Y_offset;
// const float alpha = 1.0;
// const float beta = 0.0;
// ROCBLAS_RETURN_IF_ERROR(rocblasGemmHelper(
// RocblasHandle(),
// rocblas_operation_none,
// rocblas_operation_none,
// output_image_size,M/conv_attrs_.group, kernel_dim,
// &alpha,
// b, output_image_size,
// a, kernel_dim,
// &beta,
// c, output_image_size));
// }
}
// if (Bdata!=nullptr)
// {
// //void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
// add_bias_gpu<HipT>(Stream(),Ydata, Bdata, static_cast<int>(N), static_cast<int>(M), static_cast<int>(output_image_size));
// }
}
return Status::OK();
}
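conv_im2col pre-fills Y with the bias (or zeros) and then passes alpha for both GEMM scaling factors, so the strided-batched GEMM accumulates W x col on top of it, with the convolution groups mapped onto the batch dimension. Below is a rough per-group size/stride plan mirroring the values computed above; the helper itself is illustrative, not code from the file.
// Per-group size/stride bookkeeping for the strided-batched GEMM above (illustrative).
#include <cstdint>
struct GroupGemm {
  int64_t m, n, k;                       // per-group GEMM sizes
  int64_t stride_a, stride_b, stride_c;  // strides between groups
};
GroupGemm PlanGroupedConvGemm(int64_t M, int64_t C, int64_t group,
                              int64_t kernel_size, int64_t output_image_size) {
  const int64_t kernel_dim = C / group * kernel_size;
  GroupGemm g;
  g.m = output_image_size;                      // columns of the col buffer
  g.n = M / group;                              // output channels per group
  g.k = kernel_dim;
  g.stride_a = M / group * kernel_dim;          // weight slice per group
  g.stride_b = output_image_size * kernel_dim;  // col-buffer slice per group
  g.stride_c = M / group * output_image_size;   // output slice per group
  return g;
}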
template <typename T>
Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const {
//set X
......@@ -260,6 +439,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
}
const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
s_.fwd_algo = perf.fwd_algo;
s_.workspace_bytes = perf.memory;
} else {
//set Y
......@@ -280,7 +460,24 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
template <typename T>
Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
std::lock_guard<OrtMutex> lock(s_.mutex);
// Check whether this is a 1-D or 2-D convolution; if so, compute it with im2col + GEMM
const Tensor* W = context->Input<Tensor>(1);
TensorShapeVector kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
const size_t kernel_rank = kernel_shape.size();
if(kernel_rank==2||kernel_rank==1)
{
//std::cout<<"conv compute with im2col+gemm"<<std::endl;
ORT_RETURN_IF_ERROR(conv_im2col(context));
return Status::OK();
}
// Otherwise compute the convolution through MIOpen
//std::cout<<"conv compute with miopen"<<std::endl;
ORT_RETURN_IF_ERROR(UpdateState(context));
if (s_.Y->Shape().Size() == 0) {
return Status::OK();
}
......
......@@ -191,6 +191,7 @@ class Conv : public RocmKernel {
}
Status UpdateState(OpKernelContext* context, bool bias_expected = false) const;
Status conv_im2col(OpKernelContext* context, bool bias_expected = false) const;
ConvAttributes conv_attrs_;
mutable MiopenConvState<miopenConvAlgoPerf_t> s_;
constexpr static auto kDefaultConvAlgo = miopenConvolutionFwdAlgoGEMM;
......
......@@ -2,6 +2,13 @@
// Licensed under the MIT License.
#include "conv_transpose.h"
#include "core/providers/rocm/nn/im2col.cuh"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <iostream>
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"
namespace onnxruntime {
namespace rocm {
......@@ -36,6 +43,116 @@ Status ConvTranspose<T>::ComputeInternal(OpKernelContext* context) const {
return DoConvTranspose(context, false);
}
template <typename T>
Status ConvTranspose<T>::ConvTranspose_col2im(OpKernelContext* context, bool dynamic_padding) const{
typedef typename ToHipType<T>::MappedType HipT;
size_t num_inputs = OpKernel::Node().InputDefs().size();
ConvTransposeAttributes::Prepare p;
bool has_bias = dynamic_padding ? num_inputs == 4 : num_inputs == 3;
ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(
context, has_bias, p, dynamic_padding, transposed_filter_ ? &filter_shape_ : nullptr));
// Bail out early if one of the dimensions is zero.
if (p.Y->Shape().Size() == 0) {
return Status::OK();
}
const int64_t input_image_size = p.input_shape.Size();
const int64_t X_offset = p.num_input_channels / conv_transpose_attrs_.group * input_image_size;
const int64_t Y_offset = p.Y->Shape().Size() / p.Y->Shape()[0] / conv_transpose_attrs_.group;
const int64_t W_offset = p.F->Shape().Size() / conv_transpose_attrs_.group;
const int64_t kernel_size = TensorShape(p.kernel_shape).Size();
const int64_t kernel_dim = p.num_output_channels / conv_transpose_attrs_.group * kernel_size;
const int64_t output_size = (p.Y->Shape().Slice(2)).Size();
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
const int64_t col_buffer_size = kernel_dim * p.input_shape.Size();//*conv_transpose_attrs_.group ;
auto col_data = alloc->Alloc(SafeInt<size_t>(sizeof(float)) * col_buffer_size);
BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc)));
auto* col_buffer_data = reinterpret_cast<float*>(col_buffer.get());
const auto* Xdata =reinterpret_cast<const HipT*>(p.X->Data<T>());
const auto* filter_data = p.F ? reinterpret_cast<const HipT*>(p.F->Data<T>()) : reinterpret_cast< HipT*>(transposed_filter_.get());
auto* Ydata = reinterpret_cast< HipT*>(p.Y->MutableData<T>());
TensorShape output_shape = p.Y->Shape().Slice(2);
const HipT zero = ToHipType<T>::FromFloat(0.f);
const HipT one = ToHipType<T>::FromFloat(1.f);
const float alpha = 1.0f;
const float beta = 0.0f;
if (has_bias) {
const auto* Bdata =reinterpret_cast<const HipT*>(p.B->Data<T>());
const auto& b_shape = p.B->Shape();
ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D");
assign_bias_gpu<HipT>(Stream(),Ydata, Bdata, p.N, p.num_output_channels, output_size);
}
else{
assign_val_gpu<HipT>(Stream(),Ydata, zero, p.N, p.num_output_channels, output_size);
}
for (auto image_id = 0; image_id < p.N; ++image_id) {
// auto* a= filter_data ; // [C_in/G, C_out/G, k_h, k_w] flattened to [C_in/G, C_out/G*k_h*k_w]
// auto* b= Xdata + (image_id*conv_transpose_attrs_.group) * X_offset; // [1, C_in/G, H, W] flattened to [C_in/G, H*W]
// auto* y= Ydata + (image_id*conv_transpose_attrs_.group) * Y_offset;
// int stride_A = p.num_input_channels / conv_transpose_attrs_.group*kernel_dim;
// int stride_B = input_image_size*kernel_dim;
// int stride_C = p.num_input_channels / conv_transpose_attrs_.group*input_image_size;
// ROCBLAS_RETURN_IF_ERROR(rocblasGemmStridedBatchedHelper(
// RocblasHandle(),
// rocblas_operation_none,
// p.F ? rocblas_operation_transpose:rocblas_operation_none,//rocblas_operation_transpose
// static_cast<int>(input_image_size),static_cast<int>(kernel_dim),static_cast<int>(p.num_input_channels / conv_transpose_attrs_.group),
// &alpha,
// b, static_cast<int>(input_image_size),stride_B, //x
// a, static_cast<int>(kernel_dim),stride_A, //w
// &beta,
// col_buffer_data, static_cast<int>(input_image_size),stride_C,conv_transpose_attrs_.group));
for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
//gemm
auto* a= filter_data + group_id * W_offset; // [C_in/G, C_out/G, k_h, k_w] flattened to [C_in/G, C_out/G*k_h*k_w]
auto* b= Xdata + (image_id*conv_transpose_attrs_.group+group_id) * X_offset; // [1, C_in/G, H, W] flattened to [C_in/G, H*W]
auto* y= Ydata + (image_id*conv_transpose_attrs_.group+group_id) * Y_offset;
ROCBLAS_RETURN_IF_ERROR(rocblasGemmHelper(
RocblasHandle(),
rocblas_operation_none,
p.F ? rocblas_operation_transpose:rocblas_operation_none,//rocblas_operation_transpose
static_cast<int>(input_image_size),static_cast<int>(kernel_dim),static_cast<int>(p.num_input_channels / conv_transpose_attrs_.group),
&one,
b, static_cast<int>(input_image_size), //x
a, static_cast<int>(kernel_dim), //w
&zero,
col_buffer_data, static_cast<int>(input_image_size)));
if (p.X->Shape().NumDimensions() == 4){
col2im_gpu<HipT>(Stream(),col_buffer_data,
p.num_output_channels / conv_transpose_attrs_.group, p.Y->Shape()[2], p.Y->Shape()[3],
p.kernel_shape[0],p.kernel_shape[1], p.strides[0],p.strides[1],
p.pads[0],p.pads[1],p.pads[2],p.pads[3],
p.dilations[0],p.dilations[1], y);
}
else if(p.X->Shape().NumDimensions() == 3)
{
col2im_gpu<HipT>(Stream(),col_buffer_data,
p.num_output_channels / conv_transpose_attrs_.group, 1, p.Y->Shape()[2],
1,p.kernel_shape[0],1, p.strides[0],0,p.pads[0],0,p.pads[1], 1,p.dilations[0], y);
}
}
}
return Status::OK();
}
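ConvTranspose_col2im runs the lowering in reverse: per group, a GEMM against the (optionally transposed) filter produces a column buffer of kernel_dim by input_image_size, and col2im scatter-adds it into an output that was pre-filled with the bias. A minimal sketch of the per-group sizes, mirroring the arguments passed to rocblasGemmHelper above; the helper is illustrative only.
// Per-group sizes for the ConvTranspose GEMM + col2im path (illustrative).
#include <cstdint>
struct ConvTransposePlan {
  int64_t gemm_m;           // input_image_size (inH * inW)
  int64_t gemm_n;           // kernel_dim = C_out/group * kH * kW
  int64_t gemm_k;           // C_in / group
  int64_t col_buffer_size;  // gemm_m * gemm_n elements per group
};
ConvTransposePlan PlanConvTranspose(int64_t c_in, int64_t c_out, int64_t group,
                                    int64_t kH, int64_t kW, int64_t inH, int64_t inW) {
  ConvTransposePlan p;
  p.gemm_m = inH * inW;
  p.gemm_n = c_out / group * kH * kW;
  p.gemm_k = c_in / group;
  p.col_buffer_size = p.gemm_m * p.gemm_n;
  return p;
}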
template <typename T>
Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
typedef typename ToHipType<T>::MappedType HipT;
......@@ -43,14 +160,23 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& x_shape = X->Shape();
auto x_dims = x_shape.AsShapeVector();
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto x_dimensions = X->Shape().NumDimensions();
if (x_dimensions < 3 || x_dimensions > 5) {
// TODO: the error message should tell which operator raises it.
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input X must be 3-, 4- or 5-dimensional.",
" X: ", X->Shape().ToString().c_str());
}
if(x_dimensions==4||x_dimensions==3)
{
ORT_RETURN_IF_ERROR(ConvTranspose_col2im(context,dynamic_padding));
return Status::OK();
}
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
const Tensor* W = context->Input<Tensor>(1);
const TensorShape& w_shape = W->Shape();
auto w_dims = w_shape.AsShapeVector();
......@@ -181,8 +307,8 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
miopenConvolutionBackwardData(
MiopenHandle(),
&alpha,
s_.x_tensor,
x_data,
s_.x_tensor,
x_data,
s_.w_desc,
w_data,
s_.conv_desc,
......