Commit 25d7fde8 authored by gaoqiong's avatar gaoqiong
Browse files

lite

parent 8439d29f
......@@ -8,6 +8,12 @@
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/rocm/nn/conv.h"
#include "core/providers/cpu/nn/conv_transpose_attributes.h"
#include "core/common/span_utils.h"
#include "core/providers/rocm/math/gemm.h"
#include "core/providers/cpu/math/gemm_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/tunable/gemm.h"
#include "core/common/safeint.h"
namespace onnxruntime {
namespace rocm {
......@@ -18,11 +24,15 @@ class ConvTranspose : public RocmKernel {
ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info){};
Status ComputeInternal(OpKernelContext* context) const override;
Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const;
Status ConvTranspose_col2im(OpKernelContext* context, bool dynamic_padding) const;
private:
ConvTransposeAttributes conv_transpose_attrs_;
mutable MiopenConvState<miopenConvAlgoPerf_t> s_;
// for pre-packing usage
TensorShape filter_shape_;
BufferUniquePtr transposed_filter_;
};
} // namespace rocm
......
#ifndef IM2COL_H
#define IM2COL_H
#pragma once
#include <cassert>
// Expands one image (channels x height x width) into the column buffer used by
// a GEMM-based convolution. Launches asynchronously on `stream`.
template <typename T>
void im2col_gpu(hipStream_t stream,const T *im,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,
T *data_col);
// Adds biases[c] to every element of channel c of an NC* tensor (accumulating).
template <typename T>
void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
// Overwrites every element of channel c with biases[c].
template <typename T>
void assign_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
// Overwrites every element of the tensor with the scalar `val`.
template <typename T>
void assign_val_gpu(hipStream_t stream,T *output,const T val,const int batch,const int c_out,const int out_putsize);
// Inverse of im2col: accumulates the (float) column buffer back into the image
// `im` of element type T. Note data_col is always float regardless of T.
template <typename T>
void col2im_gpu(hipStream_t stream,const float *data_col,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,
T *im);
#endif
#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "im2col.cuh"
#include "ort_sugon.cuh"
// im2col kernel: each thread owns one output position (channel_in, h_out, w_out)
// and copies its ksize_h * ksize_w receptive-field taps into data_col, writing
// padding_value for taps that fall outside the image. The column buffer layout
// is data_col[(c*ksize_h*ksize_w + i*ksize_w + j) * output_h*output_w + h_out*output_w + w_out]
// (the inner loops advance the write pointer by one output_h*output_w plane per tap).
// n = channels * output_h * output_w; grid-stride loop so any grid size works.
// pad_b / pad_r are accepted but unused here (the caller folded them into output_h/w).
template <typename T>
__global__ void im2col_gpu_kernel(const int n, const T* data_im,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_t, const int pad_l,const int pad_b,const int pad_r,
const int stride_h,const int stride_w,const int dilation_h,const int dilation_w,
const int output_h, const int output_w,
T *data_col,const T padding_value
) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
for(; index < n; index += blockDim.x*gridDim.x){// grid-stride loop
int w_out = index % output_w; // output column handled by this thread
int h_index = index / output_w;
int h_out = h_index % output_h;
int channel_in = h_index / output_h;
int channel_out = channel_in * ksize_h * ksize_w; // first column row for this channel
int h_in = h_out * stride_h - pad_t; // top-left corner of the window in the image
int w_in = w_out * stride_w - pad_l;
T* data_col_ptr = data_col;
data_col_ptr += (channel_out * output_h + h_out) * output_w + w_out;
const T* data_im_ptr = data_im;
data_im_ptr += (channel_in * height + h_in) * width + w_in;
for (int i = 0; i < ksize_h; ++i) {
for (int j = 0; j < ksize_w; ++j) {
int h = h_in + i*dilation_h;
int w = w_in + j*dilation_w;
// Out-of-image taps (padding region) get padding_value; the ternary
// guarantees the guarded load only happens when (h, w) is in bounds.
*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
data_im_ptr[i*dilation_h * width + j*dilation_w] : padding_value;
data_col_ptr += output_h * output_w; // next filter-tap plane
}
}
}
}
// Host launcher for im2col: expands one image (channels x height x width) into
// the column buffer consumed by a GEMM-based convolution. Asynchronous on
// `stream`; output_h/output_w are the convolution output spatial dims and also
// the column count per filter tap.
template <typename T>
void im2col_gpu(hipStream_t stream,const T *im,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,
T *data_col){
  const T padding_value = 0;
  // Standard convolution output-size formula with dilated kernel extent.
  const int output_h = (height + pad_b + pad_t - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
  const int output_w = (width + pad_l + pad_r - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
  const int num_kernels = channels * output_h * output_w;
  // Robustness fix: a launch with gridDim 0 is an invalid configuration, so
  // skip the launch entirely when there is no work (empty/degenerate shapes).
  if (num_kernels <= 0) return;
  im2col_gpu_kernel<<<(num_kernels + BLOCK - 1) / BLOCK,
                      BLOCK, 0, stream>>>(
      num_kernels, im, height, width, ksize_h, ksize_w, pad_t, pad_l, pad_b, pad_r,
      stride_h, stride_w, dilation_h, dilation_w,
      output_h, output_w, data_col, padding_value);
}
// Explicit instantiations of im2col_gpu for the element types this provider uses.
#define INSTANTIATEIM2COL_GPU(T) \
template void im2col_gpu<T>(hipStream_t stream,const T *im,int channels, int height, int width,int ksize_h,int ksize_w, int stride_h,\
int stride_w,int pad_t,int pad_l,int pad_b,int pad_r,int dilation_h,int dilation_w,T *data_col);
INSTANTIATEIM2COL_GPU(float)
INSTANTIATEIM2COL_GPU(half)
// Accumulates biases[c] into every element of channel c of a tensor laid out
// as (batch, channels, out_putsize). One thread per element; threads past the
// end of the tensor do nothing.
template <typename T>
__global__ void add_bias_kernel(T *output,const T *biases,const int batch,const int channels,const int out_putsize)
{
  const int total = batch * channels * out_putsize;
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < total) {
    const int spatial = tid % out_putsize;                 // offset inside the channel plane
    const int channel = (tid / out_putsize) % channels;    // channel index
    const int sample  = tid / (out_putsize * channels);    // batch index
    output[(sample * channels + channel) * out_putsize + spatial] += biases[channel];
  }
}
// Launches add_bias_kernel over batch * c_out * out_putsize elements on `stream`.
template <typename T>
void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
{
  const int total = c_out * out_putsize * batch;
  const int grid = (total + BLOCK - 1) / BLOCK;
  add_bias_kernel<<<grid, BLOCK, 0, stream>>>(output, biases, batch, c_out, out_putsize);
}
// Explicit instantiations of add_bias_gpu for float and half.
#define INSTANTIATEADD_BIAS_GPU(T) \
template void add_bias_gpu<T>(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
INSTANTIATEADD_BIAS_GPU(float)
INSTANTIATEADD_BIAS_GPU(half)
// Broadcast-assigns biases[c] into every element of channel c (overwrites —
// unlike add_bias_kernel, which accumulates). Tensor layout is
// (batch, channels, out_putsize); one thread per element.
template <typename T>
__global__ void assign_bias_kernel(T *output,const T *biases,const int batch,const int channels,const int out_putsize)
{
  const int total = batch * channels * out_putsize;
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < total) {
    const int spatial = tid % out_putsize;                 // offset inside the channel plane
    const int channel = (tid / out_putsize) % channels;    // channel index
    const int sample  = tid / (out_putsize * channels);    // batch index
    output[(sample * channels + channel) * out_putsize + spatial] = biases[channel];
  }
}
// Launches assign_bias_kernel over batch * c_out * out_putsize elements on `stream`.
template <typename T>
void assign_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
{
  const int total = c_out * out_putsize * batch;
  const int grid = (total + BLOCK - 1) / BLOCK;
  assign_bias_kernel<<<grid, BLOCK, 0, stream>>>(output, biases, batch, c_out, out_putsize);
}
// Explicit instantiations of assign_bias_gpu for float and half.
#define INSTANTIATEASSIGN_BIAS_GPU(T) \
template void assign_bias_gpu<T>(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
INSTANTIATEASSIGN_BIAS_GPU(float)
INSTANTIATEASSIGN_BIAS_GPU(half)
// Fills every element of the (batch, channels, out_putsize) tensor with the
// scalar `val`. One thread per element.
template <typename T>
__global__ void assign_val_kernel(T *output,const T val,const int batch,const int channels,const int out_putsize)
{
  const int total = batch * channels * out_putsize;
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < total) {
    const int spatial = tid % out_putsize;                 // offset inside the channel plane
    const int channel = (tid / out_putsize) % channels;    // channel index
    const int sample  = tid / (out_putsize * channels);    // batch index
    output[(sample * channels + channel) * out_putsize + spatial] = val;
  }
}
// Launches assign_val_kernel over batch * c_out * out_putsize elements on `stream`.
template <typename T>
void assign_val_gpu(hipStream_t stream,T *output,const T val,const int batch,const int c_out,const int out_putsize)
{
  const int total = c_out * out_putsize * batch;
  const int grid = (total + BLOCK - 1) / BLOCK;
  assign_val_kernel<<<grid, BLOCK, 0, stream>>>(output, val, batch, c_out, out_putsize);
}
// Explicit instantiations of assign_val_gpu for float and half. (`T val` here
// vs `const T val` in the definition is fine: top-level const on a by-value
// parameter is not part of the function signature.)
#define INSTANTIATEASSIGN_VAL_GPU(T) \
template void assign_val_gpu<T>(hipStream_t stream,T *output,T val,const int batch,const int c_out,const int out_putsize);
INSTANTIATEASSIGN_VAL_GPU(float)
INSTANTIATEASSIGN_VAL_GPU(half)
// col2im (float destination): scatters the column buffer back into the image.
// Each thread owns one image element (c, h, w); it visits every output window
// (h_col, w_col) that covers the element and accumulates the matching column
// entries. n = channels * height * width; grid-stride loop. Accumulates into
// data_im (+=), so the caller must pre-initialize the destination.
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_t, const int pad_l,const int pad_b,const int pad_r,
const int stride_h,const int stride_w,const int dilation_h,const int dilation_w,
const int output_h, const int output_w,
float *data_im
) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
for(; index < n; index += blockDim.x*gridDim.x)
{
float val = 0.0f;
// Padded coordinates of this image element.
int w = index % width + pad_l;
int h = (index / width) % height + pad_t;
int c = index / (width * height);
// Dilated kernel extent.
int dkernel_h = dilation_h * (ksize_h - 1) + 1;
int dkernel_w = dilation_w * (ksize_w - 1) + 1;
// Range of output windows whose receptive field contains (h, w):
// *_start is the earliest window, *_end is one past the latest.
int w_col_start = (w < dkernel_w) ? 0 : (w - dkernel_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, output_w);
int h_col_start = (h < dkernel_h) ? 0 : (h - dkernel_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, output_h);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// With dilation the element only belongs to the window if its offset
// from the window origin is an exact multiple of the dilation.
int k_index_w= (w-w_col*stride_w)%dilation_w;
int k_index_h= (h-h_col*stride_h)%dilation_h;
if(k_index_w!=0||k_index_h!=0) continue;
// BUGFIX: the filter-row offset must be scaled by ksize_w (the row width
// of the tap grid), not ksize_h. im2col writes the buffer with layout
// c*ksize_h*ksize_w + i*ksize_w + j, so using ksize_h here read the wrong
// entries for non-square kernels.
int c_col = c * ksize_h*ksize_w + (h - h_col * stride_h)/dilation_h * ksize_w + (w - w_col * stride_w)/dilation_w;
val += data_col[(c_col * output_h + h_col) * output_w + w_col];
}
}
data_im[index] += val;
}
}
// col2im (__half destination): same scatter as the float overload, but the
// running sum is carried in float for accuracy — the existing data_im value is
// read, widened, accumulated, and written back as half (so this overload also
// accumulates into a pre-initialized destination).
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_t, const int pad_l,const int pad_b,const int pad_r,
const int stride_h,const int stride_w,const int dilation_h,const int dilation_w,
const int output_h, const int output_w,
__half *data_im
) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
for(; index < n; index += blockDim.x*gridDim.x)
{
float val = __half2float(data_im[index]);
// Padded coordinates of this image element.
int w = index % width + pad_l;
int h = (index / width) % height + pad_t;
int c = index / (width * height);
// Dilated kernel extent.
int dkernel_h = dilation_h * (ksize_h - 1) + 1;
int dkernel_w = dilation_w * (ksize_w - 1) + 1;
// Range of output windows whose receptive field contains (h, w).
int w_col_start = (w < dkernel_w) ? 0 : (w - dkernel_w) / stride_w + 1;
int w_col_end = min(w / stride_w + 1, output_w);
int h_col_start = (h < dkernel_h) ? 0 : (h - dkernel_h) / stride_h + 1;
int h_col_end = min(h / stride_h + 1, output_h);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// Dilation check: offset must be a multiple of the dilation to be a real tap.
int k_index_w= (w-w_col*stride_w)%dilation_w;
int k_index_h= (h-h_col*stride_h)%dilation_h;
if(k_index_w!=0||k_index_h!=0) continue;
// BUGFIX: row offset inside the tap grid scales by ksize_w, not ksize_h
// (column layout written by im2col is c*ksize_h*ksize_w + i*ksize_w + j).
int c_col = c * ksize_h*ksize_w + (h - h_col * stride_h)/dilation_h * ksize_w + (w - w_col * stride_w)/dilation_w;
val += data_col[(c_col * output_h + h_col) * output_w + w_col];
}
}
data_im[index] = __float2half(val);
}
}
// Host launcher for col2im: derives the column-buffer spatial dims the same
// way im2col_gpu does, then launches one thread per image element
// (channels * height * width) on `stream`.
template <typename T>
void col2im_gpu(hipStream_t stream,const float *data_col,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,T *im){
  const int64_t eff_kh = dilation_h * (ksize_h - 1) + 1;  // dilated kernel extent
  const int64_t eff_kw = dilation_w * (ksize_w - 1) + 1;
  const int out_h = (height + pad_b + pad_t - eff_kh) / stride_h + 1;
  const int out_w = (width + pad_l + pad_r - eff_kw) / stride_w + 1;
  const int total = channels * height * width;
  col2im_gpu_kernel<<<(total + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
      total, data_col, height, width, ksize_h, ksize_w, pad_t, pad_l, pad_b, pad_r,
      stride_h, stride_w, dilation_h, dilation_w, out_h, out_w, im);
}
// Explicit instantiations of col2im_gpu. The column buffer is always float;
// only the image element type T varies.
#define INSTANTIATECOL2IM_GPU(T) \
template void col2im_gpu<T>(hipStream_t stream,const float *data_col, \
int channels, int height, int width, \
int ksize_h,int ksize_w, int stride_h,int stride_w, \
int pad_t,int pad_l,int pad_b,int pad_r, \
int dilation_h,int dilation_w, \
T *im);
INSTANTIATECOL2IM_GPU(float)
INSTANTIATECOL2IM_GPU(half)
\ No newline at end of file
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <stdio.h>
#include <stdlib.h>
// Reads the ORT_MIOPEN_ENABLE environment variable.
// Returns true only when the variable is set and its first character is '1';
// unset, empty, '0', or any other value yields false.
bool get_miopenenv_miopen()
{
  const char* value = getenv("ORT_MIOPEN_ENABLE");
  if (value == nullptr) {
    return false;
  }
  return *value == '1';
}
#ifndef ORT_SUGON_H
#define ORT_SUGON_H
#define BLOCK 256
bool get_miopenenv_miopen();
//inline bool miopen_enable=get_miopenenv_miopen();
// dim3 hip_gridsize(const int n);
#endif
#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "pool_sugon.cuh"
#include "ort_sugon.cuh"
//#include "core/providers/rocm/nn/pool_sugon.cuh"
//#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <math.h>
// Max pooling (float, NCHW layout): one thread per output element. Window taps
// that fall into the padding region contribute -INFINITY and therefore never
// win the max. n = batch * in_c * out_height * out_width.
__global__ void max_pool2d_kernel(const int n, const int in_c,const int in_h,const int in_w,const int ksize_h,const int ksize_w,
const int stride_h,const int stride_w,const int pad_t,const int pad_l,const int pad_b,const int pad_r, const int out_height,const int out_width,
const float *input, float *output)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) return;
  // Decompose the flat id into (batch, channel, out_row, out_col).
  const int ow = tid % out_width;  tid /= out_width;
  const int oh = tid % out_height; tid /= out_height;
  const int ch = tid % in_c;       tid /= in_c;
  const int nb = tid;
  // Top-left corner of the pooling window in input coordinates.
  const int row0 = oh * stride_h - pad_t;
  const int col0 = ow * stride_w - pad_l;
  float best = -INFINITY;
  for (int r = 0; r < ksize_h; ++r) {
    for (int s = 0; s < ksize_w; ++s) {
      const int ir = row0 + r;
      const int ic = col0 + s;
      if (ir >= 0 && ir < in_h && ic >= 0 && ic < in_w) {
        const float v = input[ic + in_w * (ir + in_h * (ch + nb * in_c))];
        if (v > best) best = v;
      }
    }
  }
  output[ow + out_width * (oh + out_height * (ch + in_c * nb))] = best;
}
// Max pooling (__half, NCHW layout): one thread per output element. Padding
// taps contribute -INFINITY (as half) and so never win the max.
// n = batch * in_c * out_height * out_width.
__global__ void max_pool2d_kernel(const int n, const int in_c,const int in_h,const int in_w,const int ksize_h,const int ksize_w,
const int stride_h,const int stride_w,const int pad_t,const int pad_l,const int pad_b,const int pad_r, const int out_height,const int out_width,
const __half *input, __half *output)
{
int h = out_height;
int w = out_width;
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into (b, k, i, j) = (batch, channel, out_row, out_col).
int j = id % w;
id /= w;
int i = id % h;
id /= h;
int k = id % c;
id /= c;
int b = id;
int w_offset = -pad_l;
int h_offset = -pad_t;
int out_index = j + w*(i + h*(k + c*b));
__half max = __float2half(-INFINITY);
__half min_val= __float2half(-INFINITY); // sentinel for padding taps
int l, m;
for(l = 0; l < ksize_h; ++l){
for(m = 0; m < ksize_w; ++m){
int cur_h = h_offset + i*stride_h + l;
int cur_w = w_offset + j*stride_w + m;
int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
int valid = (cur_h >= 0 && cur_h < in_h &&
cur_w >= 0 && cur_w < in_w);
// The ternary only loads input[index] when the tap is in bounds.
__half val = (valid != 0) ? input[index] : min_val;
max = (val > max) ? val : max;
}
}
output[out_index] = max;
}
// Launches max_pool2d_kernel with one thread per output element
// (channels * batch * out_height * out_width) on `stream`.
template <typename T>
void max_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output){
  const int total = channels * batch * out_height * out_width;
  const int grid = (total + BLOCK - 1) / BLOCK;
  max_pool2d_kernel<<<grid, BLOCK, 0, stream>>>(
      total, channels, height, width, ksize_h, ksize_w, stride_h, stride_w,
      pad_t, pad_l, pad_b, pad_r, out_height, out_width, im, output);
}
// Explicit instantiations of max_pool2d for float and half.
#define INSTANTIATEMAX_POOL2D(T) \
template void max_pool2d(hipStream_t stream,const T *im, const int batch, \
const int channels, const int height, const int width, \
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w, \
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width, \
T *output);
INSTANTIATEMAX_POOL2D(float)
INSTANTIATEMAX_POOL2D(half)
// Average pooling (NCHW layout): one thread per output element. Padding taps
// contribute 0 to the sum, but the divisor is always ksize_h*ksize_w, i.e.
// count_include_pad semantics (border outputs are averaged over the full
// window size, padding included). Accumulation and division happen in T, so
// for T = __half the sum is carried in half precision.
template <typename T>
__global__ void avg_pool2d_kernel(const int n, const int in_c,const int in_h,const int in_w,const int ksize_h,const int ksize_w,
const int stride_h,const int stride_w,const int pad_t,const int pad_l,const int pad_b,const int pad_r, const int out_height,const int out_width,
const T *input, T *output)
{
int h = out_height;
int w = out_width;
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into (b, k, i, j) = (batch, channel, out_row, out_col).
int j = id % w;
id /= w;
int i = id % h;
id /= h;
int k = id % c;
id /= c;
int b = id;
int w_offset = -pad_l;
int h_offset = -pad_t;
int out_index = j + w*(i + h*(k + c*b));
T sum=0.0;
T zero= 0;
int l, m;
for(l = 0; l < ksize_h; ++l){
for(m = 0; m < ksize_w; ++m){
int cur_h = h_offset + i*stride_h + l;
int cur_w = w_offset + j*stride_w + m;
int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
int valid = (cur_h >= 0 && cur_h < in_h &&
cur_w >= 0 && cur_w < in_w);
// The ternary only loads input[index] when the tap is in bounds.
sum += (valid != 0) ? input[index] : zero;
}
}
T count= ksize_h*ksize_w;
output[out_index] = sum/(count);
}
// Launches avg_pool2d_kernel with one thread per output element
// (channels * batch * out_height * out_width) on `stream`.
template <typename T>
void avg_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output){
  const int total = channels * batch * out_height * out_width;
  const int grid = (total + BLOCK - 1) / BLOCK;
  avg_pool2d_kernel<<<grid, BLOCK, 0, stream>>>(
      total, channels, height, width, ksize_h, ksize_w, stride_h, stride_w,
      pad_t, pad_l, pad_b, pad_r, out_height, out_width, im, output);
}
// Explicit instantiations of avg_pool2d for float and half.
#define INSTANTIATEAVG_POOL2D(T) \
template void avg_pool2d(hipStream_t stream,const T *im, const int batch, \
const int channels, const int height, const int width, \
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w, \
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,\
T *output);
INSTANTIATEAVG_POOL2D(float)
INSTANTIATEAVG_POOL2D(half)
// Global average pooling (float), shuffle-reduction variant: 64 threads
// cooperate on each (batch, channel) plane. Each lane strided-sums every 64th
// element of its plane, then the 64 partials are combined with six rounds of
// __shfl_down (width 64 — a full AMD wavefront) and lane 0 writes the mean.
// n = batch * in_c * 64; the caller only uses this path when in_h*in_w >= 64.
__global__ void global_avg_pool2d_kernel2(const int n, const int in_c,const int in_h,const int in_w,const float *input, float *output){
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int i = id % 64; // lane index within the 64-thread group for one plane
id /= 64;
int k = id % c; // channel index
id /= c;
int b = id; // batch index
float tmp=0.0;
// Strided partial sum over this plane: lane i handles elements i, i+64, ...
for(int m=i+(in_h*in_w*(k+b*c));m<(k+1+b*c)*in_h*in_w;m+=64)
{
tmp+=input[m];
}
// Tree reduction across the 64 lanes via shuffles (offsets 1,2,4,...,32).
int delta = 1;
for (int j = 0; j < 6; j++) {
tmp += __shfl_down(tmp,delta,64);
delta += delta;
}
// NOTE(review): threads with id >= n returned before this barrier, which is
// undefined behavior if any remain in the block; the barrier also appears
// unnecessary after a pure shuffle reduction — verify whether it can be removed.
__syncthreads();
if(i==0)
{
int out_index=k+c*b;
output[out_index] = tmp/(in_h*in_w);
}
}
// Global average pooling, serial fallback: one thread per (batch, channel)
// pair sums its whole in_h * in_w plane and writes the mean. Accumulation is
// done in T (half-precision accumulation when T = __half). n = batch * in_c.
template <typename T>
__global__ void global_avg_pool2d_kernel1(const int n, const int in_c,const int in_h,const int in_w,const T *input, T *output){
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) return;
  const int ch = tid % in_c;       // channel index
  const int nb = tid / in_c;       // batch index
  const int plane = ch + nb * in_c;
  T acc = 0.0;
  for (int r = 0; r < in_h; ++r) {
    for (int s = 0; s < in_w; ++s) {
      acc += input[s + in_w * (r + in_h * plane)];
    }
  }
  T divisor = in_h * in_w;
  output[ch + in_c * nb] = acc / divisor;
}
// Float entry point for global average pooling: chooses the 64-lane shuffle
// kernel when each plane has at least 64 elements, otherwise the serial
// per-channel kernel.
void global_avg_pool2d(hipStream_t stream,const float *im, const int batch,const int channels, const int height, const int width,float *output){
  if (height * width >= 64) {
    const int total = channels * batch * 64;   // 64 cooperating lanes per plane
    global_avg_pool2d_kernel2<<<(total + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
        total, channels, height, width, im, output);
  } else {
    const int total = channels * batch;        // one thread per plane
    global_avg_pool2d_kernel1<<<(total + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
        total, channels, height, width, im, output);
  }
}
// __half entry point: the shuffle-based kernel above is float-only (shuffles
// are not used for half here), so the half path always takes the serial
// per-channel kernel.
void global_avg_pool2d(hipStream_t stream,const __half *im, const int batch,const int channels, const int height, const int width,__half *output){
  const int total = channels * batch;
  global_avg_pool2d_kernel1<<<(total + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
      total, channels, height, width, im, output);
}
// #define INSTANTIATEGLOBAL_AVG_POOL2D(T)
// template void global_avg_pool2d(hipStream_t stream,const T *im, const int batch,const int channels,
// const int height, const int width,T *output);
// INSTANTIATEGLOBAL_AVG_POOL2D(float)
// INSTANTIATEGLOBAL_AVG_POOL2D(half)
#ifndef POOL_SUGON_H
#define POOL_SUGON_H
#pragma once
// 2D max pooling over an NCHW tensor; one output element per thread.
template <typename T>
void max_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output);
// 2D average pooling over an NCHW tensor (divisor is the full window size).
template <typename T>
void avg_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output);
// Global average pooling (mean over the whole H*W plane per channel).
void global_avg_pool2d(hipStream_t stream,const float *im, const int batch,const int channels, const int height, const int width,float *output);
void global_avg_pool2d(hipStream_t stream,const __half *im, const int batch,const int channels, const int height, const int width,__half *output);
#endif
\ No newline at end of file
......@@ -8,6 +8,16 @@
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
#include "core/providers/rocm/math/binary_elementwise_ops.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include "reduction_sugon.cuh"
#include <iostream>
using namespace std;
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"
using namespace onnxruntime::common;
namespace onnxruntime {
......@@ -714,8 +724,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
template <bool allow_multi_axes>
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = ctx->Input<Tensor>(0);
std::vector<int64_t> axes;
std::vector<int64_t> inputshape;
long int *inputshape_ptr=inputshape.data();
std::vector<int64_t> outputshape;
long int *outputshape_ptr=outputshape.data();
size_t num_inputs = ctx->InputCount();
if (num_inputs == 2) {
......@@ -743,6 +760,40 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenR
axes,
prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
//Only Max Min need to set ReduceTensorIndices MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES as per miopen library manual
//MIOPEN_REDUCE_TENSOR_NO_INDICES
//ReduceTensorIndices != MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES &&
if(ReduceTensorIndices != MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES )
{
const auto* Xdata = reinterpret_cast<const HipT*>(X->Data<T>());
auto* Ydata =reinterpret_cast< HipT*>( Y->MutableData<T>());
const TensorShape& x_shape = X->Shape();
const auto x_dims = x_shape.GetDims(); //{N,C,H,W}
inputshape.assign(x_dims.begin(),x_dims.end());
int axes_size=axes.size();
const TensorShape& y_shape = Y->Shape();
const auto y_dims = y_shape.GetDims();
if(x_shape.NumDimensions()<7&&x_shape.NumDimensions()>=1 && miopen_reduce_op<8 && (std::is_same<T, float>::value || std::is_same<T, MLFloat16>::value) )
{
int x_dims_[6]={1,1,1,1,1,1};
int out_dims_[6]={1,1,1,1,1,1}; //kernel支持最大支持x_shape.NumDimensions()为6
for(int i=0;i<x_shape.NumDimensions();i++)
{
x_dims_[i]=x_dims[i];
out_dims_[i]=prepare_reduce_metadata.output_dims[i];
}
Reduce_Compute<HipT>(Stream(),Xdata,x_dims_[0],x_dims_[1],x_dims_[2],x_dims_[3],x_dims_[4],x_dims_[5],out_dims_[0],out_dims_[1],out_dims_[2],
out_dims_[3],out_dims_[4],out_dims_[5],Ydata,miopen_reduce_op,calculate_log_, calculate_sqt_, log_sum_exp_);
return Status::OK();
}
}
//printf("================still use miopen\n");
const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute();
return ReduceComputeCore<T, ReduceTensorIndices>(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes,
......
#include <hiprand.h>
#include <hip/hip_runtime.h>
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <math.h>
#include "reduction_sugon.cuh"
//Reduce computation rules:
//1. When axes is empty and noop_with_empty_axes is set to 1, the output equals the input; that case is already handled elsewhere.
//2. When axes is empty, keepdim is false, and noop_with_empty_axes is 0, every dimension is reduced.
//3. When axes is empty, keepdim is true, and noop_with_empty_axes is 0, every dimension is reduced; the output shape is [1,1,...,1].
//4. When axes is non-empty and keepdim is true, reduce along axes and keep the number of dimensions unchanged.
//5. When axes is non-empty and keepdim is false, reduce along axes and the number of dimensions changes.
// ReduceMax (float) over up to 6 dimensions. One thread per output element;
// n = product of the output dims. For each axis: if the output extent differs
// from the input extent the axis is reduced, so the loop scans its full extent
// and the thread's coordinate on that axis is 0; otherwise the loop runs once
// (extent 1) and the thread's coordinate addresses the axis. The sum `n+b`,
// `c+k`, ... therefore always yields the correct input coordinate, because one
// of the two terms is always 0.
__global__ void ReduceMax_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
// Independent of keepdim: a reduced axis can be treated as size 1 in the
// output without affecting the coordinate computation.
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into output coordinates (b, k, i, j, j_index4, j_index5).
int j_index5=id % output_index5;
id /= output_index5;
int j_index4=id % output_index4;
id /= output_index4;
int j = id % output_width;
id /= output_width;
int i = id % output_height;
id /= output_height;
int k = id % output_channels;
id /= output_channels;
int b = id;
// For each axis: loop over the full extent when the axis is reduced, else once.
int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
float val = -INFINITY;
// Note: the loop variable `n` below shadows the parameter `n` (element count).
for(int n=0;n<((batch==output_batch)?1:batch);n++)
{
for(int c=0;c<((channels==output_channels)?1:channels);c++)
{
for(int h=0;h<((height==output_height)?1:height);h++)
{
for(int w=0;w<((width==output_width)?1:width);w++)
{
for(int p=0;p<((index4==output_index4)?1:index4);p++)
{
for(int q=0;q<((index5==output_index5)?1:index5);q++)
{
int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
val =(im[im_index]>val)?im[im_index]:val;
}
}
}
}
}
}
output[index]=val;
}
// ReduceMax (__half) over up to 6 dimensions — same coordinate scheme as the
// float overload: a reduced axis has output extent 1, so its loop scans the
// full input extent while the thread coordinate on that axis is 0, and the
// sums `n+b`, `c+k`, ... pick whichever term is active.
__global__ void ReduceMax_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
// Independent of keepdim: a reduced axis can be treated as size 1 in the output.
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into output coordinates.
int j_index5=id % output_index5;
id /= output_index5;
int j_index4=id % output_index4;
id /= output_index4;
int j = id % output_width;
id /= output_width;
int i = id % output_height;
id /= output_height;
int k = id % output_channels;
id /= output_channels;
int b = id;
// For each axis: loop over the full extent when the axis is reduced, else once.
int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
__half val = __float2half(-INFINITY); // negative infinity as half
for(int n=0;n<((batch==output_batch)?1:batch);n++)
{
for(int c=0;c<((channels==output_channels)?1:channels);c++)
{
for(int h=0;h<((height==output_height)?1:height);h++)
{
for(int w=0;w<((width==output_width)?1:width);w++)
{
for(int p=0;p<((index4==output_index4)?1:index4);p++)
{
for(int q=0;q<((index5==output_index5)?1:index5);q++)
{
int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
val =(im[im_index]>val)?im[im_index]:val;
}
}
}
}
}
}
output[index]=val;
}
// ReduceAMax (float): max of absolute values, same 6-D coordinate scheme as
// ReduceMax_kernel above (reduced axes have output extent 1 and are scanned by
// the loops; `n+b` etc. pick whichever term is active).
__global__ void ReduceAMax_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
// Independent of keepdim: a reduced axis can be treated as size 1 in the output.
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into output coordinates.
int j_index5=id % output_index5;
id /= output_index5;
int j_index4=id % output_index4;
id /= output_index4;
int j = id % output_width;
id /= output_width;
int i = id % output_height;
id /= output_height;
int k = id % output_channels;
id /= output_channels;
int b = id;
// For each axis: loop over the full extent when the axis is reduced, else once.
int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
float val = -INFINITY;
for(int n=0;n<((batch==output_batch)?1:batch);n++)
{
for(int c=0;c<((channels==output_channels)?1:channels);c++)
{
for(int h=0;h<((height==output_height)?1:height);h++)
{
for(int w=0;w<((width==output_width)?1:width);w++)
{
for(int p=0;p<((index4==output_index4)?1:index4);p++)
{
for(int q=0;q<((index5==output_index5)?1:index5);q++)
{
int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
// NOTE(review): confirm `abs` resolves to the float overload in device
// code here; if it picks C's int abs the value would be truncated —
// fabsf would be unambiguous.
val =(abs(im[im_index])>val)?abs(im[im_index]):val;
}
}
}
}
}
}
output[index]=val;
}
// ReduceAMax (__half): max of absolute values; |x| is computed manually via a
// compare-and-negate because the float `abs` path is avoided for half. Same
// 6-D coordinate scheme as the kernels above.
__global__ void ReduceAMax_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
// Independent of keepdim: a reduced axis can be treated as size 1 in the output.
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into output coordinates.
int j_index5=id % output_index5;
id /= output_index5;
int j_index4=id % output_index4;
id /= output_index4;
int j = id % output_width;
id /= output_width;
int i = id % output_height;
id /= output_height;
int k = id % output_channels;
id /= output_channels;
int b = id;
// For each axis: loop over the full extent when the axis is reduced, else once.
int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
__half val = __float2half(-INFINITY);
__half zero =0.0;
for(int n=0;n<((batch==output_batch)?1:batch);n++)
{
for(int c=0;c<((channels==output_channels)?1:channels);c++)
{
for(int h=0;h<((height==output_height)?1:height);h++)
{
for(int w=0;w<((width==output_width)?1:width);w++)
{
for(int p=0;p<((index4==output_index4)?1:index4);p++)
{
for(int q=0;q<((index5==output_index5)?1:index5);q++)
{
int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
// Manual |x| for half: negate when negative.
__half tmp =(im[im_index]>zero)?im[im_index]:-im[im_index];
val =(tmp>val)?tmp:val;
}
}
}
}
}
}
output[index]=val;
}
// ReduceMin (float): same 6-D coordinate scheme as ReduceMax_kernel above,
// with the accumulator starting at +INFINITY and taking the minimum.
__global__ void ReduceMin_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat id into output coordinates.
int j_index5=id % output_index5;
id /= output_index5;
int j_index4=id % output_index4;
id /= output_index4;
int j = id % output_width;
id /= output_width;
int i = id % output_height;
id /= output_height;
int k = id % output_channels;
id /= output_channels;
int b = id;
int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
float val = INFINITY;
// For each axis: loop over the full extent when the axis is reduced, else once.
for(int n=0;n<((batch==output_batch)?1:batch);n++)
{
for(int c=0;c<((channels==output_channels)?1:channels);c++)
{
for(int h=0;h<((height==output_height)?1:height);h++)
{
for(int w=0;w<((width==output_width)?1:width);w++)
{
for(int p=0;p<((index4==output_index4)?1:index4);p++)
{
for(int q=0;q<((index5==output_index5)?1:index5);q++)
{
int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
val =(im[im_index]<val)?im[im_index]:val;
}
}
}
}
}
}
output[index]=val;
}
// Reduce-min kernel, __half specialization: one thread per output element.
// Reduces every axis whose input extent differs from its output extent.
__global__ void ReduceMin_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate
    // (b, k, i, j, j_index4, j_index5), innermost axis first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    // Start from +inf so the first comparison always selects an input value.
    __half val = __float2half(INFINITY);
    // A reduced axis (output extent 1, so the output coordinate is 0) loops
    // over its whole input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // NOTE(review): relies on device-side __half operator< —
                // confirm the minimum HIP/DTK toolchain version supports it.
                val =(im[im_index]<val)?im[im_index]:val;
              }
            }
          }
        }
      }
    }
    output[index]=val;
}
// Sum-reduction over every axis whose input extent differs from its output
// extent. One thread computes one output element; T is float or __half.
template <typename T>
__global__ void ReduceSum_kernel(int n,const T *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,T *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int rem = tid;
    const int o5 = rem % output_index5; rem /= output_index5;
    const int o4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels; rem /= output_channels;
    const int ob = rem;
    // Reduced axes (output extent 1, coordinate 0) cover their whole input
    // extent; kept axes run exactly once at the output coordinate.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    T acc = 0.0;
    for (int bb = 0; bb < nb; bb++)
      for (int cc = 0; cc < nc; cc++)
        for (int hh = 0; hh < nh; hh++)
          for (int ww = 0; ww < nw; ww++)
            for (int p4 = 0; p4 < n4; p4++)
              for (int p5 = 0; p5 < n5; p5++) {
                const int src = (((((bb + ob) * channels + cc + oc) * height + hh + oh) * width + ww + ow) * index4 + p4 + o4) * index5 + p5 + o5;
                acc += im[src];
              }
    const int dst = ((((ob * output_channels + oc) * output_height + oh) * output_width + ow) * output_index4 + o4) * output_index5 + o5;
    output[dst] = acc;
}
// Product-reduction over every axis whose input extent differs from its
// output extent (float specialization). One thread per output element.
__global__ void ReduceProd_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int rem = tid;
    const int o5 = rem % output_index5; rem /= output_index5;
    const int o4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels; rem /= output_channels;
    const int ob = rem;
    // Reduced axes iterate over the full input extent; kept axes once.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float prod = 1.0f;  // multiplicative identity
    for (int bb = 0; bb < nb; bb++)
      for (int cc = 0; cc < nc; cc++)
        for (int hh = 0; hh < nh; hh++)
          for (int ww = 0; ww < nw; ww++)
            for (int p4 = 0; p4 < n4; p4++)
              for (int p5 = 0; p5 < n5; p5++) {
                const int src = (((((bb + ob) * channels + cc + oc) * height + hh + oh) * width + ww + ow) * index4 + p4 + o4) * index5 + p5 + o5;
                prod *= im[src];
              }
    const int dst = ((((ob * output_channels + oc) * output_height + oh) * output_width + ow) * output_index4 + o4) * output_index5 + o5;
    output[dst] = prod;
}
// Product-reduction, __half specialization: one thread per output element.
// The running product is kept in float for range/precision and narrowed to
// __half only when written out.
__global__ void ReduceProd_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 1.0;  // multiplicative identity; float accumulator
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                val *=__half2float(im[im_index]);
              }
            }
          }
        }
      }
    }
    output[index]=__float2half(val);
}
// Mean-reduction over every axis whose input extent differs from its output
// extent. One thread per output element; T is float or __half.
template <typename T>
__global__ void ReduceMean_kernel(int n,const T *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,T *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    T val = 0.0;    // running sum
    T count=0;      // number of summed elements
    // Sentinels used by the final guard below (both zero).
    T val_ = 0.0;
    T count_=0;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once — so count >= 1 always.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                val +=im[im_index];
                // NOTE(review): when T is __half, count++ saturates once the
                // counter reaches 2048 (half cannot represent x+1), so the
                // mean is wrong for reductions over >2048 elements — confirm
                // whether callers ever reduce that many elements in fp16.
                count++;
              }
            }
          }
        }
      }
    }
    // Guard: emits 0 when the sum is exactly 0 (val/count would also be 0) or
    // when count is 0 (unreachable — every loop runs at least once).
    output[index]=(val==val_||count==count_)?val_:val/count;
}
// L1-norm reduction (sum of absolute values) over every axis whose input
// extent differs from its output extent (float specialization).
// One thread computes one output element.
__global__ void ReduceL1_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // fabsf: the unambiguous single-precision abs. A bare abs()
                // can bind to the C integer overload in device code and
                // silently truncate the operand to int.
                val +=fabsf(im[im_index]);
              }
            }
          }
        }
      }
    }
    output[index]=val;
}
// L1-norm reduction (sum of absolute values), __half specialization.
// One thread per output element; |x| is computed via compare-and-negate
// because a __half fabs overload may not be available on older toolchains.
__global__ void ReduceL1_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    __half val = 0.0;   // running sum in half precision
    __half zero = 0.0;  // comparison constant for the branchless abs below
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // NOTE(review): the sum accumulates in __half, which
                // saturates at 65504 — long reductions may overflow to inf;
                // confirm whether a float accumulator is needed here.
                val +=(im[im_index]>zero)?im[im_index]:-im[im_index];
              }
            }
          }
        }
      }
    }
    output[index]=val;
}
// L2-norm reduction (sqrt of sum of squares) over every axis whose input
// extent differs from its output extent (float specialization).
// One thread computes one output element.
__global__ void ReduceL2_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                val +=im[im_index] *im[im_index];
              }
            }
          }
        }
      }
    }
    // sqrtf: stay in single precision; plain sqrt() promotes to double,
    // which is needlessly slow on GPUs with weak fp64 throughput.
    output[index]=sqrtf(val);
}
// L2-norm reduction (sqrt of sum of squares), __half specialization.
// One thread per output element; accumulation is done in float.
__global__ void ReduceL2_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // Widen BEFORE squaring. The original squared in __half
                // (im[i]*im[i] is a half*half product), which overflows to
                // +inf for |x| > ~255.9 even though the float accumulator
                // could represent the square exactly enough.
                float x = __half2float(im[im_index]);
                val += x * x;
              }
            }
          }
        }
      }
    }
    // sqrtf keeps the final step in single precision.
    output[index]=__float2half(sqrtf(val));
}
// Sum-of-squares reduction over every axis whose input extent differs from
// its output extent (float specialization). One thread per output element.
__global__ void ReduceSumSquare_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int rem = tid;
    const int o5 = rem % output_index5; rem /= output_index5;
    const int o4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels; rem /= output_channels;
    const int ob = rem;
    // Reduced axes iterate over the full input extent; kept axes once.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; bb++)
      for (int cc = 0; cc < nc; cc++)
        for (int hh = 0; hh < nh; hh++)
          for (int ww = 0; ww < nw; ww++)
            for (int p4 = 0; p4 < n4; p4++)
              for (int p5 = 0; p5 < n5; p5++) {
                const int src = (((((bb + ob) * channels + cc + oc) * height + hh + oh) * width + ww + ow) * index4 + p4 + o4) * index5 + p5 + o5;
                acc += im[src] * im[src];
              }
    const int dst = ((((ob * output_channels + oc) * output_height + oh) * output_width + ow) * output_index4 + o4) * output_index5 + o5;
    output[dst] = acc;
}
// Sum-of-squares reduction, __half specialization. One thread per output
// element; accumulation is done in float.
__global__ void ReduceSumSquare_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // Widen BEFORE squaring. The original computed im[i]*im[i]
                // as a half*half product, which overflows to +inf for
                // |x| > ~255.9 before it ever reaches the float accumulator.
                float x = __half2float(im[im_index]);
                val += x * x;
              }
            }
          }
        }
      }
    }
    output[index]=__float2half(val);
}
// ReduceLogSum: log of the sum of the inputs (ONNX semantics: log(sum(x))),
// float specialization. One thread per output element.
__global__ void ReduceLogSum_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // Sum first, take the log once at the end. The original
                // accumulated log(x) per element, which computes
                // sum(log(x)) = log(prod(x)) — not ONNX ReduceLogSum.
                val +=im[im_index];
              }
            }
          }
        }
      }
    }
    output[index]=logf(val);
}
// ReduceLogSum: log of the sum of the inputs (ONNX semantics: log(sum(x))),
// __half specialization. One thread per output element; float accumulator.
__global__ void ReduceLogSum_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // Sum first, take the log once at the end. The original
                // accumulated log(x) per element, which computes
                // sum(log(x)) = log(prod(x)) — not ONNX ReduceLogSum.
                val += __half2float(im[im_index]);
              }
            }
          }
        }
      }
    }
    // __float2half (not __half2float, which the original used here and which
    // only compiled through an accidental float->__half implicit conversion).
    output[index]=__float2half(logf(val));
}
// ReduceLogSumExp: log(sum(exp(x))), float specialization.
// One thread per output element.
// NOTE(review): no max-subtraction is performed, so exp() can overflow for
// inputs around >88 — confirm input ranges before relying on this for large
// magnitudes.
__global__ void ReduceLogSumExp_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{ //ln(e^x1+e^x2+....)
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                // expf/logf: stay in single precision instead of promoting
                // every element to double via exp()/log().
                val +=expf(im[im_index]);
              }
            }
          }
        }
      }
    }
    output[index]=logf(val);
}
// ReduceLogSumExp: log(sum(exp(x))), __half specialization.
// One thread per output element; float accumulator.
__global__ void ReduceLogSumExp_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{ //ln(e^x1+e^x2+....)
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id >= n) return;
    // Unravel the flat thread id into the 6-D output coordinate, innermost first.
    int j_index5=id % output_index5;
    id /= output_index5;
    int j_index4=id % output_index4;
    id /= output_index4;
    int j = id % output_width;
    id /= output_width;
    int i = id % output_height;
    id /= output_height;
    int k = id % output_channels;
    id /= output_channels;
    int b = id;
    // Flat offset of this thread's output element.
    int index=((((b*output_channels+k)*output_height+i)*output_width+j)*output_index4+j_index4)*output_index5+j_index5;
    float val = 0.0f;
    // A reduced axis (output extent 1, coordinate 0) loops over its whole
    // input extent; a kept axis loops exactly once.
    for(int n=0;n<((batch==output_batch)?1:batch);n++)
    {
      for(int c=0;c<((channels==output_channels)?1:channels);c++)
      {
        for(int h=0;h<((height==output_height)?1:height);h++)
        {
          for(int w=0;w<((width==output_width)?1:width);w++)
          {
            for(int p=0;p<((index4==output_index4)?1:index4);p++)
            {
              for(int q=0;q<((index5==output_index5)?1:index5);q++)
              {
                int im_index=(((((n+b)*channels+c+k)*height+h+i)*width+w+j)*index4+p+j_index4)*index5+q+j_index5;
                float tmp= __half2float(im[im_index]);
                // expf/logf: single-precision transcendentals instead of the
                // double-precision exp()/log() the original called.
                val +=expf(tmp);
              }
            }
          }
        }
      }
    }
    output[index]=__float2half(logf(val));
}
//MIOpen reduce-operator numbering used by ReduceType below:
//MIOPEN_REDUCE_TENSOR_ADD = 0 | sum
//MIOPEN_REDUCE_TENSOR_MUL = 1 | product
//MIOPEN_REDUCE_TENSOR_MIN = 2 | minimum
//MIOPEN_REDUCE_TENSOR_MAX = 3 | maximum
//MIOPEN_REDUCE_TENSOR_AMAX= 4 | maximum absolute value
//MIOPEN_REDUCE_TENSOR_AVG = 5 | mean
//MIOPEN_REDUCE_TENSOR_NORM1 = 6 | L1 norm
//MIOPEN_REDUCE_TENSOR_NORM2 = 7 | L2 norm
//See https://github.com/ROCmSoftwarePlatform/MIOpen/blob/83da5e99d67bd5b6d0c7c48924868c37394136c2/include/miopen/miopen.h#L507
// Host-side dispatcher for the custom reduce kernels. ReduceType follows the
// miopenReduceTensorOp_t numbering (see the table above). For ReduceType==0
// the three flags select the LogSum / SumSquare / LogSumExp variants.
// An axis is reduced iff its output extent differs from its input extent.
template <typename T>
void Reduce_Compute(hipStream_t stream,const T *im, const int batch,const int channels,const int height,const int width,const int index4,const int index5,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5,
                T *output,int ReduceType,const bool calculate_log_,const bool calculate_sqt_,const bool log_sum_exp_){
  // One thread per OUTPUT element. Every kernel decomposes its thread id over
  // all six output axes, so the launch count must cover output_index4 and
  // output_index5 as well — the original multiplied only the first four axes,
  // leaving part of the output unwritten whenever output_index4*output_index5>1.
  int num_kernels=output_batch*output_channels*output_height*output_width*output_index4*output_index5;
  if(ReduceType==0) //ReduceSum (and its flagged variants)
  {
    if(calculate_log_)
      ReduceLogSum_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
    else if(calculate_sqt_)
      ReduceSumSquare_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
    else if(log_sum_exp_)
      ReduceLogSumExp_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
    else
      ReduceSum_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==1) //ReduceProd
  {
    ReduceProd_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==2) //ReduceMin
  {
    ReduceMin_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==3) //ReduceMax
  {
    ReduceMax_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==4) //ReduceAMax (max of absolute values)
  {
    ReduceAMax_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==5) //ReduceMean
  {
    ReduceMean_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==6) //ReduceL1
  {
    ReduceL1_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  else if(ReduceType==7) //ReduceL2
  {
    ReduceL2_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                             output_channels,output_height,output_width,output_index4,output_index5);
  }
  // NOTE(review): unknown ReduceType values fall through silently (no launch).
}
#define INSTANTIATEREDUCE_COMPUTE(T) \
  template void Reduce_Compute(hipStream_t stream,const T *im, const int batch,const int channels,const int height,const int width,const int index4,const int index5, \
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5, \
                T *output,int ReduceType,const bool calculate_log_,const bool calculate_sqt_,const bool log_sum_exp_);
INSTANTIATEREDUCE_COMPUTE(float)
INSTANTIATEREDUCE_COMPUTE(half)
#ifndef REDUCTION_SUGON_H
#define REDUCTION_SUGON_H
// NOTE(review): both an include guard and #pragma once are used; one of the
// two is redundant — confirm which convention this project follows.
#pragma once
// Host-side launcher for the custom Reduce kernels. ReduceType follows the
// miopenReduceTensorOp_t numbering (0=ADD, 1=MUL, 2=MIN, 3=MAX, 4=AMAX,
// 5=AVG, 6=NORM1, 7=NORM2). When ReduceType==0, calculate_log_ /
// calculate_sqt_ / log_sum_exp_ select the LogSum / SumSquare / LogSumExp
// variants. An axis is reduced iff its output extent differs from its input
// extent; the launch is enqueued on the given HIP stream (asynchronous).
template <typename T>
void Reduce_Compute(hipStream_t stream,const T *im, const int batch,const int channels,const int height,const int width,const int index4,const int index5,
                const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5,
                T *output,int ReduceType,const bool calculate_log_,const bool calculate_sqt_,const bool log_sum_exp_);
#endif
\ No newline at end of file
......@@ -83,6 +83,32 @@ inline rocblas_status rocblasGemmHelper(rocblas_handle handle,
rocblas_gemm_algo_standard, 0, get_flag());
}
// GEMM helper for half-precision A/B with a single-precision C/D buffer.
// alpha/beta arrive as host-side half values; rocblas_gemm_ex expects them in
// the compute type (f32), so they are widened on the host first.
inline rocblas_status rocblasGemmHelper(rocblas_handle handle,
                                        rocblas_operation transa,
                                        rocblas_operation transb,
                                        int m, int n, int k,
                                        const half* alpha,
                                        const half* A, int lda,
                                        const half* B, int ldb,
                                        const half* beta,
                                        float* C, int ldc) {
  const float alpha32 = onnxruntime::math::halfToFloat(*reinterpret_cast<const uint16_t*>(alpha));
  const float beta32 = onnxruntime::math::halfToFloat(*reinterpret_cast<const uint16_t*>(beta));
  // C is passed twice: it serves as both the input C and the output D matrix.
  return rocblas_gemm_ex(handle, transa, transb,
                         m, n, k,
                         &alpha32,
                         A, rocblas_datatype_f16_r, lda,
                         B, rocblas_datatype_f16_r, ldb,
                         &beta32,
                         C, rocblas_datatype_f32_r, ldc,
                         C, rocblas_datatype_f32_r, ldc,
                         rocblas_datatype_f32_r,
                         rocblas_gemm_algo_standard, 0, 0);
}
inline rocblas_status rocblasGemmHelper(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
......
......@@ -7,6 +7,8 @@ import datetime
import platform
import subprocess
import sys
import re
import os
from distutils import log as logger
from distutils.command.build_ext import build_ext as _build_ext
from glob import glob, iglob
......@@ -17,12 +19,76 @@ from shutil import copyfile
from packaging.tags import sys_tags
from setuptools import Extension, setup
from setuptools.command.install import install as InstallCommandBase
from typing import Optional, Union
nightly_build = False
package_name = "onnxruntime"
wheel_name_suffix = None
def get_sha(ort_root: Union[str, Path]) -> str:
    """Return the git HEAD commit hash of *ort_root*, or 'Unknown' on any failure
    (git missing, not a repository, bad path, ...)."""
    try:
        out = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=ort_root)
        return out.decode("ascii").strip()
    except Exception:
        return "Unknown"
def get_abi():
    """Detect the compiler's _GLIBCXX_USE_CXX11_ABI setting.

    Returns a string like 'abi0' or 'abi1', or 'abiUnknown' when detection
    fails (gcc missing, macro not found, or any unexpected error).
    """
    try:
        command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI"
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        output = result.stdout.strip()
        if not output:
            # subprocess.run does not raise when gcc/fgrep fail (no check=True);
            # the original fell through and returned a bare "abi" here.
            return 'abiUnknown'
        abi = "abi" + output.split(" ")[-1]
        return abi
    except Exception:
        return 'abiUnknown'
def get_version_add(sha: Optional[str] = None) -> str:
    # Build the local-version suffix "git<sha7>.<abi>.dtk<rocm>" and rewrite
    # the __dcu_version__ line in DCUORT_VERSION_NUMBER and
    # onnxruntime/__init__.py in place.
    # NOTE(review): annotated -> str but never returns a value (returns None);
    # get_version() ignores the return value, so this only misleads readers.
    version=''
    ort_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(ort_root, "DCUORT_VERSION_NUMBER")
    if sha != 'Unknown':
        if sha is None:
            sha = get_sha(ort_root)
        version = 'git' + sha[:7]  # short (7-char) commit hash
    # abi
    version += "." + get_abi()
    # dtk version
    if os.getenv("ROCM_PATH"):
        rocm_path = os.getenv('ROCM_PATH', "")
        rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
        with open(rocm_version_path, 'r',encoding='utf-8') as file:
            lines = file.readlines()
            # NOTE(review): [:-2] drops the trailing newline PLUS the last
            # character of the version string — confirm the final character is
            # intentionally discarded (breaks if the file has no newline).
            rocm_version=lines[0][:-2].replace(".", "")
            version += ".dtk" + rocm_version
    lines=[]
    with open(add_version_path, 'r',encoding='utf-8') as file:
        lines = file.readlines()
    # First line of DCUORT_VERSION_NUMBER is the __dcu_version__ assignment.
    lines[0] = "__dcu_version__ = '1.14.0+{}'\n".format(version)
    with open(add_version_path, encoding="utf-8",mode="w") as file:
        file.writelines(lines)
        file.close()  # redundant: the with-statement already closes the file
    init_path=os.path.join(ort_root, "onnxruntime/__init__.py")
    with open(init_path, 'r',encoding='utf-8') as file:
        lines = file.readlines()
    # NOTE(review): hard-coded line 12 of onnxruntime/__init__.py must hold the
    # __dcu_version__ assignment — silently corrupts the file if it moves.
    lines[11] = "__dcu_version__ = '1.14.0+{}'\n".format(version)
    with open(init_path, encoding="utf-8",mode="w") as file:
        file.writelines(lines)
        file.close()  # redundant: the with-statement already closes the file
def get_version():
    """Regenerate DCUORT_VERSION_NUMBER (via get_version_add) and return the
    '__dcu_version__' string it defines."""
    get_version_add()
    ort_root = os.path.dirname(os.path.abspath(__file__))
    version_file = os.path.join(ort_root, "DCUORT_VERSION_NUMBER")
    # Execute the version file in an isolated namespace. The original ran
    # exec() against the function's implicit locals and then read
    # locals()['__dcu_version__'] — that only works as a CPython accident;
    # the language does not guarantee exec() can create function locals.
    scope = {}
    with open(version_file, encoding='utf-8') as f:
        exec(compile(f.read(), version_file, 'exec'), scope)
    return scope['__dcu_version__']
def parse_arg_remove_boolean(argv, arg_name):
arg_value = False
if arg_name in sys.argv:
......@@ -546,7 +612,7 @@ package_data["onnxruntime"] = data + examples + extra
version_number = ""
with open("VERSION_NUMBER") as f:
version_number = f.readline().strip()
sub_version=""
sub_version="_light"
dtk_version =""
with open("/opt/dtk/.info/rocm_version") as f:
dtk_version = f.readline().strip()
......@@ -675,8 +741,8 @@ if enable_training:
# Setup
setup(
name="onnxruntime",#name=package_name
version=version_number+"+dtk"+dtk_version ,#
name="onnxruntime-lite",#name=package_name
version=get_version() ,#
description="ONNX Runtime is a runtime accelerator for Machine Learning models",
long_description=long_description,
author="Microsoft Corporation",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment