Commit 25d7fde8 authored by gaoqiong's avatar gaoqiong
Browse files

lite

parent 8439d29f
...@@ -8,6 +8,12 @@ ...@@ -8,6 +8,12 @@
#include "core/providers/rocm/miopen_common.h" #include "core/providers/rocm/miopen_common.h"
#include "core/providers/rocm/nn/conv.h" #include "core/providers/rocm/nn/conv.h"
#include "core/providers/cpu/nn/conv_transpose_attributes.h" #include "core/providers/cpu/nn/conv_transpose_attributes.h"
#include "core/common/span_utils.h"
#include "core/providers/rocm/math/gemm.h"
#include "core/providers/cpu/math/gemm_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/tunable/gemm.h"
#include "core/common/safeint.h"
namespace onnxruntime { namespace onnxruntime {
namespace rocm { namespace rocm {
...@@ -18,11 +24,15 @@ class ConvTranspose : public RocmKernel { ...@@ -18,11 +24,15 @@ class ConvTranspose : public RocmKernel {
ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info){}; ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info){};
Status ComputeInternal(OpKernelContext* context) const override; Status ComputeInternal(OpKernelContext* context) const override;
Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const; Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const;
Status ConvTranspose_col2im(OpKernelContext* context, bool dynamic_padding) const;
private: private:
ConvTransposeAttributes conv_transpose_attrs_; ConvTransposeAttributes conv_transpose_attrs_;
mutable MiopenConvState<miopenConvAlgoPerf_t> s_; mutable MiopenConvState<miopenConvAlgoPerf_t> s_;
// for pre-packing usage
TensorShape filter_shape_;
BufferUniquePtr transposed_filter_;
}; };
} // namespace rocm } // namespace rocm
......
// GPU helpers used by the hand-written ConvTranspose path:
// im2col/col2im plus small per-channel bias / uniform-fill utilities.
#ifndef IM2COL_H
#define IM2COL_H
#pragma once
#include <cassert>
// Expands `im` (channels x height x width) into a column buffer for GEMM-based
// convolution; `data_col` must hold channels*ksize_h*ksize_w*out_h*out_w elements.
template <typename T>
void im2col_gpu(hipStream_t stream,const T *im,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,
T *data_col);
// output[b][c][s] += biases[c] for every spatial position s.
template <typename T>
void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
// output[b][c][s] = biases[c] for every spatial position s.
template <typename T>
void assign_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
// Uniform fill: output[...] = val over batch*c_out*out_putsize elements.
template <typename T>
void assign_val_gpu(hipStream_t stream,T *output,const T val,const int batch,const int c_out,const int out_putsize);
// Adjoint of im2col: scatters/accumulates the column buffer back into `im`.
// Note: data_col is always float, even when T is half (matches the .cu kernels).
template <typename T>
void col2im_gpu(hipStream_t stream,const float *data_col,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,
T *im);
#endif
#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "im2col.cuh"
#include "ort_sugon.cuh"
// im2col expansion kernel: each thread handles one (channel, h_out, w_out)
// output position and copies its ksize_h x ksize_w input patch into the
// column buffer, substituting padding_value for taps outside the image.
// Column-buffer layout: row (c*ksize_h*ksize_w + i*ksize_w + j), one
// output_h*output_w plane per kernel tap (i, j).
template <typename T>
__global__ void im2col_gpu_kernel(const int n, const T* data_im,
const int height, const int width, const int ksize_h, const int ksize_w,
const int pad_t, const int pad_l,const int pad_b,const int pad_r,
const int stride_h,const int stride_w,const int dilation_h,const int dilation_w,
const int output_h, const int output_w,
T *data_col,const T padding_value
) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
for(; index < n; index += blockDim.x*gridDim.x){// grid-stride loop over all n positions
int w_out = index % output_w; // output column handled by this thread
int h_index = index / output_w;
int h_out = h_index % output_h;
int channel_in = h_index / output_h;
int channel_out = channel_in * ksize_h * ksize_w; // first column-buffer row for this channel's patch
int h_in = h_out * stride_h - pad_t; // top-left corner of the receptive field (may be negative)
int w_in = w_out * stride_w - pad_l;
T* data_col_ptr = data_col;
data_col_ptr += (channel_out * output_h + h_out) * output_w + w_out;
const T* data_im_ptr = data_im;
data_im_ptr += (channel_in * height + h_in) * width + w_in;
for (int i = 0; i < ksize_h; ++i) {
for (int j = 0; j < ksize_w; ++j) {
int h = h_in + i*dilation_h;
int w = w_in + j*dilation_w;
// In-bounds taps come from the image; out-of-bounds taps get the padding value.
*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
data_im_ptr[i*dilation_h * width + j*dilation_w] : padding_value;
data_col_ptr += output_h * output_w; // advance to the next kernel-tap plane
}
}
}
}
// Host-side launcher for im2col_gpu_kernel.  Derives the column-buffer
// spatial size from the convolution geometry and enqueues one thread per
// (channel, output row, output column) on the given HIP stream.
template <typename T>
void im2col_gpu(hipStream_t stream,const T *im,
                int channels, int height, int width,
                int ksize_h,int ksize_w, int stride_h,int stride_w,
                int pad_t,int pad_l,int pad_b,int pad_r,
                int dilation_h,int dilation_w,
                T *data_col){
  const T padding_value = 0;  // zero padding outside the image
  const int effective_kh = dilation_h * (ksize_h - 1) + 1;  // dilated kernel extent
  const int effective_kw = dilation_w * (ksize_w - 1) + 1;
  const int output_h = (height + pad_t + pad_b - effective_kh) / stride_h + 1;
  const int output_w = (width + pad_l + pad_r - effective_kw) / stride_w + 1;
  const int total_threads = channels * output_h * output_w;
  const int grid = (total_threads + BLOCK - 1) / BLOCK;
  im2col_gpu_kernel<<<grid, BLOCK, 0, stream>>>(
      total_threads, im, height, width, ksize_h, ksize_w,
      pad_t, pad_l, pad_b, pad_r, stride_h, stride_w,
      dilation_h, dilation_w, output_h, output_w, data_col, padding_value);
}
// Explicit template instantiations for the element types used by the provider.
#define INSTANTIATEIM2COL_GPU(T) \
template void im2col_gpu<T>(hipStream_t stream,const T *im,int channels, int height, int width,int ksize_h,int ksize_w, int stride_h,\
int stride_w,int pad_t,int pad_l,int pad_b,int pad_r,int dilation_h,int dilation_w,T *data_col);
INSTANTIATEIM2COL_GPU(float)
INSTANTIATEIM2COL_GPU(half)
// Adds a per-channel bias in place over an NCHW-flattened tensor:
// output[b][c][s] += biases[c], where s indexes the out_putsize elements of
// one channel plane.  One thread per element.
template <typename T>
__global__ void add_bias_kernel(T *output,const T *biases,const int batch,const int channels,const int out_putsize)
{
  const int total = channels * out_putsize * batch;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= total) {
    return;
  }
  const int spatial = tid % out_putsize;            // position within one channel plane
  const int chan = (tid / out_putsize) % channels;  // channel index
  const int b = tid / (out_putsize * channels);     // batch index
  output[(b * channels + chan) * out_putsize + spatial] += biases[chan];
}
// Launches add_bias_kernel with one thread per element (batch*c_out*out_putsize).
template <typename T>
void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
{
int num_kernels = c_out*out_putsize*batch;
add_bias_kernel<<<(num_kernels+BLOCK-1)/BLOCK, BLOCK,0,stream>>>(output, biases, batch, c_out, out_putsize);
}
// Explicit instantiations for float and half.
#define INSTANTIATEADD_BIAS_GPU(T) \
template void add_bias_gpu<T>(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
INSTANTIATEADD_BIAS_GPU(float)
INSTANTIATEADD_BIAS_GPU(half)
// Broadcasts a per-channel bias over an NCHW-flattened tensor:
// output[b][c][s] = biases[c].  One thread per element (overwrite, not add).
template <typename T>
__global__ void assign_bias_kernel(T *output,const T *biases,const int batch,const int channels,const int out_putsize)
{
  const int total = channels * out_putsize * batch;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= total) {
    return;
  }
  const int spatial = tid % out_putsize;            // position within one channel plane
  const int chan = (tid / out_putsize) % channels;  // channel index
  const int b = tid / (out_putsize * channels);     // batch index
  output[(b * channels + chan) * out_putsize + spatial] = biases[chan];
}
// Launches assign_bias_kernel with one thread per element (batch*c_out*out_putsize).
template <typename T>
void assign_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
{
int num_kernels = c_out*out_putsize*batch;
assign_bias_kernel<<<(num_kernels+BLOCK-1)/BLOCK, BLOCK,0,stream>>>(output, biases, batch, c_out, out_putsize);
}
// Explicit instantiations for float and half.
#define INSTANTIATEASSIGN_BIAS_GPU(T) \
template void assign_bias_gpu<T>(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize);
INSTANTIATEASSIGN_BIAS_GPU(float)
INSTANTIATEASSIGN_BIAS_GPU(half)
// Fills a batch*channels*out_putsize tensor with one scalar value,
// one thread per element.
template <typename T>
__global__ void assign_val_kernel(T *output,const T val,const int batch,const int channels,const int out_putsize)
{
  const int total = channels * out_putsize * batch;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= total) {
    return;
  }
  const int spatial = tid % out_putsize;
  const int chan = (tid / out_putsize) % channels;
  const int b = tid / (out_putsize * channels);
  output[(b * channels + chan) * out_putsize + spatial] = val;
}
// Launches assign_val_kernel with one thread per element (batch*c_out*out_putsize).
template <typename T>
void assign_val_gpu(hipStream_t stream,T *output,const T val,const int batch,const int c_out,const int out_putsize)
{
int num_kernels = c_out*out_putsize*batch;
assign_val_kernel<<<(num_kernels+BLOCK-1)/BLOCK, BLOCK,0,stream>>>(output, val, batch, c_out, out_putsize);
}
// Explicit instantiations for float and half.
#define INSTANTIATEASSIGN_VAL_GPU(T) \
template void assign_val_gpu<T>(hipStream_t stream,T val,const int batch,const int c_out,const int out_putsize);
INSTANTIATEASSIGN_VAL_GPU(float)
INSTANTIATEASSIGN_VAL_GPU(half)
// col2im for float: one thread per image element.  Accumulates into data_im
// every column-buffer entry that the im2col expansion copied this element to
// (the adjoint of im2col_gpu_kernel; used by the ConvTranspose path).
// NOTE: accumulates (+=) onto data_im, so the caller must pre-initialize it.
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_t, const int pad_l,const int pad_b,const int pad_r,
    const int stride_h,const int stride_w,const int dilation_h,const int dilation_w,
    const int output_h, const int output_w,
    float *data_im
) {
  int index = blockIdx.x*blockDim.x+threadIdx.x;
  for (; index < n; index += blockDim.x*gridDim.x) {  // grid-stride loop
    float val = 0.0f;
    int w = index % width + pad_l;             // padded-image coordinates
    int h = (index / width) % height + pad_t;
    int c = index / (width * height);
    int dkernel_h = dilation_h * (ksize_h - 1) + 1;  // dilated kernel extent
    int dkernel_w = dilation_w * (ksize_w - 1) + 1;
    // Range of output positions whose receptive field covers (h, w).
    int w_col_start = (w < dkernel_w) ? 0 : (w - dkernel_w) / stride_w + 1;
    int w_col_end = min(w / stride_w + 1, output_w);
    int h_col_start = (h < dkernel_h) ? 0 : (h - dkernel_h) / stride_h + 1;
    int h_col_end = min(h / stride_h + 1, output_h);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        // A tap is valid only when the offset is an exact multiple of the dilation.
        if ((w - w_col * stride_w) % dilation_w != 0 ||
            (h - h_col * stride_h) % dilation_h != 0) continue;
        // Column-buffer row = c*ksize_h*ksize_w + kernel_row*ksize_w + kernel_col.
        // BUGFIX: was "* ksize_h"; im2col strides kernel rows by ksize_w, so the
        // old code read the wrong row whenever ksize_h != ksize_w.
        int c_col = c * ksize_h * ksize_w
                  + (h - h_col * stride_h) / dilation_h * ksize_w
                  + (w - w_col * stride_w) / dilation_w;
        val += data_col[(c_col * output_h + h_col) * output_w + w_col];
      }
    }
    data_im[index] += val;
  }
}
// col2im for __half: identical indexing to the float kernel, but the running
// sum is kept in float and data_im is read/updated via half<->float converts.
// NOTE: accumulates onto the existing contents of data_im.
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_t, const int pad_l,const int pad_b,const int pad_r,
    const int stride_h,const int stride_w,const int dilation_h,const int dilation_w,
    const int output_h, const int output_w,
    __half *data_im
) {
  int index = blockIdx.x*blockDim.x+threadIdx.x;
  for (; index < n; index += blockDim.x*gridDim.x) {  // grid-stride loop
    float val = __half2float(data_im[index]);  // start from the existing value
    int w = index % width + pad_l;
    int h = (index / width) % height + pad_t;
    int c = index / (width * height);
    int dkernel_h = dilation_h * (ksize_h - 1) + 1;
    int dkernel_w = dilation_w * (ksize_w - 1) + 1;
    // Range of output positions whose receptive field covers (h, w).
    int w_col_start = (w < dkernel_w) ? 0 : (w - dkernel_w) / stride_w + 1;
    int w_col_end = min(w / stride_w + 1, output_w);
    int h_col_start = (h < dkernel_h) ? 0 : (h - dkernel_h) / stride_h + 1;
    int h_col_end = min(h / stride_h + 1, output_h);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        // Valid tap only when the offset is an exact multiple of the dilation.
        if ((w - w_col * stride_w) % dilation_w != 0 ||
            (h - h_col * stride_h) % dilation_h != 0) continue;
        // BUGFIX: kernel-row stride in the column buffer is ksize_w (was ksize_h),
        // matching the layout produced by im2col_gpu_kernel.
        int c_col = c * ksize_h * ksize_w
                  + (h - h_col * stride_h) / dilation_h * ksize_w
                  + (w - w_col * stride_w) / dilation_w;
        val += data_col[(c_col * output_h + h_col) * output_w + w_col];
      }
    }
    data_im[index] = __float2half(val);
  }
}
// Host launcher for col2im: one thread per image element (channels*height*width).
// data_col is always float; the image type T may be float or half.
template <typename T>
void col2im_gpu(hipStream_t stream,const float *data_col,
int channels, int height, int width,
int ksize_h,int ksize_w, int stride_h,int stride_w,
int pad_t,int pad_l,int pad_b,int pad_r,
int dilation_h,int dilation_w,T *im){
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
const int64_t dkernel_h = dilation_h * (ksize_h - 1) + 1;
const int64_t dkernel_w = dilation_w * (ksize_w - 1) + 1;
int output_h = (height + pad_b + pad_t - dkernel_h) / stride_h + 1;
int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
int num_kernels = channels * height * width;
// dim3 block_size(BLOCK, 1, 1);
// dim3 grid_size((num_kernels + block_size.x - 1) / block_size.x, 1, 1);
// hipLaunchKernelGGL(col2im_gpu_kernel<T>, grid_size, block_size,0,stream,
// num_kernels, data_col, height, width, ksize_h,ksize_w, pad_t,pad_l,pad_b,pad_r,
// stride_h,stride_w,dilation_h, dilation_w,
// output_h,output_w, im );
// Overload resolution picks the float or __half kernel from the type of `im`.
col2im_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(
num_kernels, data_col, height, width, ksize_h,ksize_w, pad_t,pad_l,pad_b,pad_r,
stride_h,stride_w,dilation_h, dilation_w,output_h,output_w, im);
//printf("col2im_gpu done\n");
//if(hipDeviceSynchronize()) printf("hipDeviceSynchronize failed at col2im_gpu\n ");
}
// Explicit instantiations for float and half.
#define INSTANTIATECOL2IM_GPU(T) \
template void col2im_gpu<T>(hipStream_t stream,const float *data_col, \
int channels, int height, int width, \
int ksize_h,int ksize_w, int stride_h,int stride_w, \
int pad_t,int pad_l,int pad_b,int pad_r, \
int dilation_h,int dilation_w, \
T *im);
INSTANTIATECOL2IM_GPU(float)
INSTANTIATECOL2IM_GPU(half)
\ No newline at end of file
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <stdio.h>
#include <stdlib.h>
// Returns true iff the environment variable ORT_MIOPEN_ENABLE is set and its
// first character is '1'.  Unset, '0', or any other value disables MIOpen.
bool get_miopenenv_miopen()
{
  const char* value = getenv("ORT_MIOPEN_ENABLE");
  return value != nullptr && value[0] == '1';
}
// Common definitions shared by the hand-written ROCm ("sugon") kernels.
#ifndef ORT_SUGON_H
#define ORT_SUGON_H
// Thread-block size used by every kernel launch in these files.
#define BLOCK 256
// Reads ORT_MIOPEN_ENABLE from the environment; true iff its first char is '1'.
bool get_miopenenv_miopen();
//inline bool miopen_enable=get_miopenenv_miopen();
// dim3 hip_gridsize(const int n);
#endif
#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "pool_sugon.cuh"
#include "ort_sugon.cuh"
//#include "core/providers/rocm/nn/pool_sugon.cuh"
//#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <math.h>
// 2-D max pooling over an NCHW float tensor: one thread per output element.
// Out-of-bounds window taps contribute -INFINITY, so padding never wins.
__global__ void max_pool2d_kernel(const int n, const int in_c,const int in_h,const int in_w,const int ksize_h,const int ksize_w,
const int stride_h,const int stride_w,const int pad_t,const int pad_l,const int pad_b,const int pad_r, const int out_height,const int out_width,
const float *input, float *output)
{
int h = out_height;
int w = out_width;
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat thread id into (batch b, channel k, out row i, out col j).
int j = id % w;
id /= w;
int i = id % h;
id /= h;
int k = id % c;
id /= c;
int b = id;
int w_offset = -pad_l; // shift window start into padded coordinates
int h_offset = -pad_t;
int out_index = j + w*(i + h*(k + c*b));
float max = -INFINITY;
int l, m;
for(l = 0; l < ksize_h; ++l){
for(m = 0; m < ksize_w; ++m){
int cur_h = h_offset + i*stride_h + l;
int cur_w = w_offset + j*stride_w + m;
int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
int valid = (cur_h >= 0 && cur_h < in_h &&
cur_w >= 0 && cur_w < in_w);
float val = (valid != 0) ? input[index] : -INFINITY;
max = (val > max) ? val : max;
}
}
output[out_index] = max;
}
// __half overload of max_pool2d_kernel; identical indexing to the float
// version.  min_val (= -INFINITY as half) is the value padding taps take,
// so they can never be selected as the maximum.
__global__ void max_pool2d_kernel(const int n, const int in_c,const int in_h,const int in_w,const int ksize_h,const int ksize_w,
const int stride_h,const int stride_w,const int pad_t,const int pad_l,const int pad_b,const int pad_r, const int out_height,const int out_width,
const __half *input, __half *output)
{
int h = out_height;
int w = out_width;
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat thread id into (batch b, channel k, out row i, out col j).
int j = id % w;
id /= w;
int i = id % h;
id /= h;
int k = id % c;
id /= c;
int b = id;
int w_offset = -pad_l;
int h_offset = -pad_t;
int out_index = j + w*(i + h*(k + c*b));
__half max = __float2half(-INFINITY);
__half min_val= __float2half(-INFINITY); // padding value for out-of-bounds taps
int l, m;
for(l = 0; l < ksize_h; ++l){
for(m = 0; m < ksize_w; ++m){
int cur_h = h_offset + i*stride_h + l;
int cur_w = w_offset + j*stride_w + m;
int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
int valid = (cur_h >= 0 && cur_h < in_h &&
cur_w >= 0 && cur_w < in_w);
__half val = (valid != 0) ? input[index] : min_val;
max = (val > max) ? val : max;
}
}
output[out_index] = max;
}
// Host launcher: one thread per output element (batch*channels*out_h*out_w).
template <typename T>
void max_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output){
int num_kernels=channels*batch*out_height*out_width;
max_pool2d_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,channels,height,width,ksize_h,ksize_w,stride_h,stride_w,pad_t,pad_l,pad_b,pad_r,out_height,out_width,im,output);
}
// Explicit instantiations for float and half.
#define INSTANTIATEMAX_POOL2D(T) \
template void max_pool2d(hipStream_t stream,const T *im, const int batch, \
const int channels, const int height, const int width, \
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w, \
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width, \
T *output);
INSTANTIATEMAX_POOL2D(float)
INSTANTIATEMAX_POOL2D(half)
// 2-D average pooling over an NCHW tensor: one thread per output element.
// Out-of-bounds taps contribute 0 to the sum, but the divisor is the full
// window size (ksize_h*ksize_w) — i.e. count_include_pad semantics.
template <typename T>
__global__ void avg_pool2d_kernel(const int n, const int in_c,const int in_h,const int in_w,const int ksize_h,const int ksize_w,
const int stride_h,const int stride_w,const int pad_t,const int pad_l,const int pad_b,const int pad_r, const int out_height,const int out_width,
const T *input, T *output)
{
int h = out_height;
int w = out_width;
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
// Decompose the flat thread id into (batch b, channel k, out row i, out col j).
int j = id % w;
id /= w;
int i = id % h;
id /= h;
int k = id % c;
id /= c;
int b = id;
int w_offset = -pad_l;
int h_offset = -pad_t;
int out_index = j + w*(i + h*(k + c*b));
T sum=0.0;
T zero= 0; // value contributed by padding taps
int l, m;
for(l = 0; l < ksize_h; ++l){
for(m = 0; m < ksize_w; ++m){
int cur_h = h_offset + i*stride_h + l;
int cur_w = w_offset + j*stride_w + m;
int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c));
int valid = (cur_h >= 0 && cur_h < in_h &&
cur_w >= 0 && cur_w < in_w);
sum += (valid != 0) ? input[index] : zero;
}
}
// Divisor includes padded positions (count_include_pad behavior).
T count= ksize_h*ksize_w;
output[out_index] = sum/(count);
}
// Host launcher: one thread per output element (batch*channels*out_h*out_w).
template <typename T>
void avg_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output){
int num_kernels=channels*batch*out_height*out_width;
avg_pool2d_kernel<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,channels,height,width,ksize_h,ksize_w,stride_h,stride_w,pad_t,pad_l,pad_b,pad_r,out_height,out_width,im,output);
//printf("avg_pool2d kernel done\n");
}
// Explicit instantiations for float and half.
#define INSTANTIATEAVG_POOL2D(T) \
template void avg_pool2d(hipStream_t stream,const T *im, const int batch, \
const int channels, const int height, const int width, \
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w, \
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,\
T *output);
INSTANTIATEAVG_POOL2D(float)
INSTANTIATEAVG_POOL2D(half)
// Global average pooling (float) with 64 cooperating threads per
// (batch, channel) plane: each thread partial-sums a strided slice of the
// plane, then the 64 lanes are combined with __shfl_down (width-64 shuffle,
// i.e. one AMD wavefront).  Requires in_h*in_w >= 64 (enforced by the caller).
__global__ void global_avg_pool2d_kernel2(const int n, const int in_c,const int in_h,const int in_w,const float *input, float *output){
int c = in_c;
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int i = id % 64; // lane index within this plane's 64-thread group
id /= 64;
int k = id % c;// channel index
id /= c;
int b = id; // batch index
float tmp=0.0;
// Strided partial sum over this plane's in_h*in_w elements (stride 64).
for(int m=i+(in_h*in_w*(k+b*c));m<(k+1+b*c)*in_h*in_w;m+=64)
{
tmp+=input[m];
}
int delta = 1;
// Tree reduction across the 64 lanes: 6 shuffle steps (2^6 = 64).
for (int j = 0; j < 6; j++) {
tmp += __shfl_down(tmp,delta,64);
delta += delta;
}
__syncthreads();
if(i==0)
{
// Lane 0 holds the full sum; write the plane average.
int out_index=k+c*b;
output[out_index] = tmp/(in_h*in_w);
}
}
// Naive global average pooling: one thread per (batch, channel) plane sums
// the whole in_h*in_w plane serially.  Used for small planes and for __half
// (which cannot take the shuffle path).
template <typename T>
__global__ void global_avg_pool2d_kernel1(const int n, const int in_c,const int in_h,const int in_w,const T *input, T *output){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= n) return;
int k = id % in_c; // channel index
id /= in_c;
int b = id ; // batch index
int out_index = k + in_c*b;
T sum=0.0;
int l, m;
for(l = 0; l < in_h; ++l){
for(m = 0; m < in_w; ++m){
int index = m+in_w*(l+in_h*(k+b*in_c));
sum += input[index] ;
}
}
T count=in_h*in_w;
output[out_index] = sum/count ;
}
// Global average pooling over each (batch, channel) plane of a float NCHW
// tensor.  Planes with at least 64 elements use the shuffle-reduction kernel
// (64 threads cooperate per plane); smaller planes fall back to the naive
// one-thread-per-plane kernel.
void global_avg_pool2d(hipStream_t stream,const float *im, const int batch,const int channels, const int height, const int width,float *output){
  const int plane_size = height * width;
  if (plane_size >= 64) {
    const int threads = channels * batch * 64;
    global_avg_pool2d_kernel2<<<(threads + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
        threads, channels, height, width, im, output);
  } else {
    const int threads = channels * batch;
    global_avg_pool2d_kernel1<<<(threads + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
        threads, channels, height, width, im, output);
  }
}
// The shuffle intrinsic is not usable for half data here, so the __half
// overload is written separately and always takes the naive
// one-thread-per-plane kernel.
void global_avg_pool2d(hipStream_t stream,const __half *im, const int batch,const int channels, const int height, const int width,__half *output){
int num_kernels=channels*batch;
global_avg_pool2d_kernel1<<<(num_kernels+BLOCK-1)/BLOCK,BLOCK,0,stream>>>(num_kernels,channels,height,width,im,output);
}
// #define INSTANTIATEGLOBAL_AVG_POOL2D(T)
// template void global_avg_pool2d(hipStream_t stream,const T *im, const int batch,const int channels,
// const int height, const int width,T *output);
// INSTANTIATEGLOBAL_AVG_POOL2D(float)
// INSTANTIATEGLOBAL_AVG_POOL2D(half)
// Declarations for the hand-written ROCm pooling kernels.
#ifndef POOL_SUGON_H
#define POOL_SUGON_H
#pragma once
// 2-D max pooling over an NCHW tensor; one output element per thread.
template <typename T>
void max_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output);
// 2-D average pooling; divisor is the full window size (includes padding).
template <typename T>
void avg_pool2d(hipStream_t stream,const T *im, const int batch,
const int channels, const int height, const int width,
const int ksize_h,const int ksize_w, const int stride_h,const int stride_w,
const int pad_t,const int pad_l,const int pad_b,const int pad_r,const int out_height, const int out_width,
T *output);
// Global average pooling: one output value per (batch, channel) plane.
void global_avg_pool2d(hipStream_t stream,const float *im, const int batch,const int channels, const int height, const int width,float *output);
void global_avg_pool2d(hipStream_t stream,const __half *im, const int batch,const int channels, const int height, const int width,__half *output);
#endif
\ No newline at end of file
...@@ -8,6 +8,16 @@ ...@@ -8,6 +8,16 @@
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h" #include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
#include "core/providers/rocm/math/binary_elementwise_ops.h" #include "core/providers/rocm/math/binary_elementwise_ops.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h" #include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include "reduction_sugon.cuh"
#include <iostream>
using namespace std;
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"
using namespace onnxruntime::common; using namespace onnxruntime::common;
namespace onnxruntime { namespace onnxruntime {
...@@ -714,8 +724,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr ...@@ -714,8 +724,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
template <bool allow_multi_axes> template <bool allow_multi_axes>
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices> template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = ctx->Input<Tensor>(0); const Tensor* X = ctx->Input<Tensor>(0);
std::vector<int64_t> axes; std::vector<int64_t> axes;
std::vector<int64_t> inputshape;
long int *inputshape_ptr=inputshape.data();
std::vector<int64_t> outputshape;
long int *outputshape_ptr=outputshape.data();
size_t num_inputs = ctx->InputCount(); size_t num_inputs = ctx->InputCount();
if (num_inputs == 2) { if (num_inputs == 2) {
...@@ -743,6 +760,40 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenR ...@@ -743,6 +760,40 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, miopenR
axes, axes,
prepare_reduce_metadata)); prepare_reduce_metadata));
Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
//Only Max Min need to set ReduceTensorIndices MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES as per miopen library manual
//MIOPEN_REDUCE_TENSOR_NO_INDICES
//ReduceTensorIndices != MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES &&
if(ReduceTensorIndices != MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES )
{
const auto* Xdata = reinterpret_cast<const HipT*>(X->Data<T>());
auto* Ydata =reinterpret_cast< HipT*>( Y->MutableData<T>());
const TensorShape& x_shape = X->Shape();
const auto x_dims = x_shape.GetDims(); //{N,C,H,W}
inputshape.assign(x_dims.begin(),x_dims.end());
int axes_size=axes.size();
const TensorShape& y_shape = Y->Shape();
const auto y_dims = y_shape.GetDims();
if(x_shape.NumDimensions()<7&&x_shape.NumDimensions()>=1 && miopen_reduce_op<8 && (std::is_same<T, float>::value || std::is_same<T, MLFloat16>::value) )
{
int x_dims_[6]={1,1,1,1,1,1};
int out_dims_[6]={1,1,1,1,1,1}; //kernel支持最大支持x_shape.NumDimensions()为6
for(int i=0;i<x_shape.NumDimensions();i++)
{
x_dims_[i]=x_dims[i];
out_dims_[i]=prepare_reduce_metadata.output_dims[i];
}
Reduce_Compute<HipT>(Stream(),Xdata,x_dims_[0],x_dims_[1],x_dims_[2],x_dims_[3],x_dims_[4],x_dims_[5],out_dims_[0],out_dims_[1],out_dims_[2],
out_dims_[3],out_dims_[4],out_dims_[5],Ydata,miopen_reduce_op,calculate_log_, calculate_sqt_, log_sum_exp_);
return Status::OK();
}
}
//printf("================still use miopen\n");
const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute(); const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute();
return ReduceComputeCore<T, ReduceTensorIndices>(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes, return ReduceComputeCore<T, ReduceTensorIndices>(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes,
......
#include <hiprand.h>
#include <hip/hip_runtime.h>
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <math.h>
#include "reduction_sugon.cuh"
//Reduce computation rules:
//1. If axes is empty and noop_with_empty_axes is 1, the output equals the input; that case is handled before reaching these kernels.
//2. If axes is empty, keepdim is false and noop_with_empty_axes is 0, every dimension is reduced.
//3. If axes is empty, keepdim is true and noop_with_empty_axes is 0, every dimension is reduced and the shape becomes [1,1,...,1].
//4. If axes is non-empty and keepdim is true, reduce along axes and keep the rank unchanged.
//5. If axes is non-empty and keepdim is false, reduce along axes; the rank changes.
// ReduceMax (float) over an input viewed as up to six logical dimensions.
// Convention: for each axis, out_dim == in_dim means "not reduced" (the
// corresponding loop runs once); otherwise the whole axis is scanned.
// One thread per output element.  Independent of keepdim: squeezed axes are
// passed as size 1 by the caller, which does not change the index math.
__global__ void ReduceMax_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
    const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  // Decompose the flat thread id into output coordinates, innermost first.
  int j_index5 = id % output_index5;
  id /= output_index5;
  int j_index4 = id % output_index4;
  id /= output_index4;
  int j = id % output_width;
  id /= output_width;
  int i = id % output_height;
  id /= output_height;
  int k = id % output_channels;
  id /= output_channels;
  int b = id;
  int index = ((((b * output_channels + k) * output_height + i) * output_width + j) * output_index4 + j_index4) * output_index5 + j_index5;
  float val = -INFINITY;
  // Each loop scans its axis only when that axis is being reduced.
  // (Loop variable renamed: the original `n` shadowed the kernel parameter n.)
  for (int nb = 0; nb < ((batch == output_batch) ? 1 : batch); nb++) {
    for (int c = 0; c < ((channels == output_channels) ? 1 : channels); c++) {
      for (int h = 0; h < ((height == output_height) ? 1 : height); h++) {
        for (int w = 0; w < ((width == output_width) ? 1 : width); w++) {
          for (int p = 0; p < ((index4 == output_index4) ? 1 : index4); p++) {
            for (int q = 0; q < ((index5 == output_index5) ? 1 : index5); q++) {
              int im_index = (((((nb + b) * channels + c + k) * height + h + i) * width + w + j) * index4 + p + j_index4) * index5 + q + j_index5;
              val = (im[im_index] > val) ? im[im_index] : val;
            }
          }
        }
      }
    }
  }
  output[index] = val;
}
// ReduceMax (__half) — same axis convention and index math as the float
// overload; the running maximum starts at -INFINITY converted to half.
__global__ void ReduceMax_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
    const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  // Decompose the flat thread id into output coordinates, innermost first.
  int j_index5 = id % output_index5;
  id /= output_index5;
  int j_index4 = id % output_index4;
  id /= output_index4;
  int j = id % output_width;
  id /= output_width;
  int i = id % output_height;
  id /= output_height;
  int k = id % output_channels;
  id /= output_channels;
  int b = id;
  int index = ((((b * output_channels + k) * output_height + i) * output_width + j) * output_index4 + j_index4) * output_index5 + j_index5;
  __half val = __float2half(-INFINITY); // negative infinity as half
  // Each loop scans its axis only when that axis is being reduced.
  // (Loop variable renamed: the original `n` shadowed the kernel parameter n.)
  for (int nb = 0; nb < ((batch == output_batch) ? 1 : batch); nb++) {
    for (int c = 0; c < ((channels == output_channels) ? 1 : channels); c++) {
      for (int h = 0; h < ((height == output_height) ? 1 : height); h++) {
        for (int w = 0; w < ((width == output_width) ? 1 : width); w++) {
          for (int p = 0; p < ((index4 == output_index4) ? 1 : index4); p++) {
            for (int q = 0; q < ((index5 == output_index5) ? 1 : index5); q++) {
              int im_index = (((((nb + b) * channels + c + k) * height + h + i) * width + w + j) * index4 + p + j_index4) * index5 + q + j_index5;
              val = (im[im_index] > val) ? im[im_index] : val;
            }
          }
        }
      }
    }
  }
  output[index] = val;
}
// ReduceMax of absolute values (float) — same axis convention as
// ReduceMax_kernel: an axis is scanned only when its output extent differs
// from its input extent.  One thread per output element.
__global__ void ReduceAMax_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
    const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  // Decompose the flat thread id into output coordinates, innermost first.
  int j_index5 = id % output_index5;
  id /= output_index5;
  int j_index4 = id % output_index4;
  id /= output_index4;
  int j = id % output_width;
  id /= output_width;
  int i = id % output_height;
  id /= output_height;
  int k = id % output_channels;
  id /= output_channels;
  int b = id;
  int index = ((((b * output_channels + k) * output_height + i) * output_width + j) * output_index4 + j_index4) * output_index5 + j_index5;
  float val = -INFINITY;
  // (Loop variable renamed: the original `n` shadowed the kernel parameter n.)
  for (int nb = 0; nb < ((batch == output_batch) ? 1 : batch); nb++) {
    for (int c = 0; c < ((channels == output_channels) ? 1 : channels); c++) {
      for (int h = 0; h < ((height == output_height) ? 1 : height); h++) {
        for (int w = 0; w < ((width == output_width) ? 1 : width); w++) {
          for (int p = 0; p < ((index4 == output_index4) ? 1 : index4); p++) {
            for (int q = 0; q < ((index5 == output_index5) ? 1 : index5); q++) {
              int im_index = (((((nb + b) * channels + c + k) * height + h + i) * width + w + j) * index4 + p + j_index4) * index5 + q + j_index5;
              // fabsf: unambiguous float overload (plain abs risks resolving
              // to the integer overload and truncating); also loads once.
              float a = fabsf(im[im_index]);
              val = (a > val) ? a : val;
            }
          }
        }
      }
    }
  }
  output[index] = val;
}
// ReduceMax of absolute values (__half) — same axis convention as the float
// overload.  |x| is computed by a sign test against zero instead of a half
// fabs intrinsic.
__global__ void ReduceAMax_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
    const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  // Decompose the flat thread id into output coordinates, innermost first.
  int j_index5 = id % output_index5;
  id /= output_index5;
  int j_index4 = id % output_index4;
  id /= output_index4;
  int j = id % output_width;
  id /= output_width;
  int i = id % output_height;
  id /= output_height;
  int k = id % output_channels;
  id /= output_channels;
  int b = id;
  int index = ((((b * output_channels + k) * output_height + i) * output_width + j) * output_index4 + j_index4) * output_index5 + j_index5;
  __half val = __float2half(-INFINITY);
  __half zero = 0.0;
  // (Loop variable renamed: the original `n` shadowed the kernel parameter n.)
  for (int nb = 0; nb < ((batch == output_batch) ? 1 : batch); nb++) {
    for (int c = 0; c < ((channels == output_channels) ? 1 : channels); c++) {
      for (int h = 0; h < ((height == output_height) ? 1 : height); h++) {
        for (int w = 0; w < ((width == output_width) ? 1 : width); w++) {
          for (int p = 0; p < ((index4 == output_index4) ? 1 : index4); p++) {
            for (int q = 0; q < ((index5 == output_index5) ? 1 : index5); q++) {
              int im_index = (((((nb + b) * channels + c + k) * height + h + i) * width + w + j) * index4 + p + j_index4) * index5 + q + j_index5;
              __half tmp = (im[im_index] > zero) ? im[im_index] : -im[im_index]; // |x| via sign test
              val = (tmp > val) ? tmp : val;
            }
          }
        }
      }
    }
  }
  output[index] = val;
}
// Minimum reduction (float). One thread per output element; an axis is
// reduced iff its output extent differs from the input extent.
__global__ void ReduceMin_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float best = INFINITY;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                best = (im[src] < best) ? im[src] : best;
              }
    output[out_idx] = best;
}
// Minimum reduction (__half overload). One thread per output element; an axis
// is reduced iff its output extent differs from the input extent.
__global__ void ReduceMin_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    __half best = __float2half(INFINITY);
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                best = (im[src] < best) ? im[src] : best;
              }
    output[out_idx] = best;
}
// Sum reduction (templated on element type). One thread per output element;
// an axis is reduced iff its output extent differs from the input extent.
// The accumulator is T, so half inputs accumulate in half precision.
template <typename T>
__global__ void ReduceSum_kernel(int n,const T *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,T *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    T acc = 0.0;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += im[src];
              }
    output[out_idx] = acc;
}
// Product reduction (float). One thread per output element; an axis is
// reduced iff its output extent differs from the input extent.
__global__ void ReduceProd_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 1.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc *= im[src];
              }
    output[out_idx] = acc;
}
// Product reduction (__half overload). Accumulates in float for range/
// precision and converts back to half on the final store.
__global__ void ReduceProd_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 1.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc *= __half2float(im[src]);
              }
    output[out_idx] = __float2half(acc);
}
// Mean reduction (templated on element type). One thread per output element;
// an axis is reduced iff its output extent differs from the input extent.
// Both the running sum and the element count are held in T; NOTE(review) for
// T == __half the count itself is stored in half precision — verify expected
// reduction sizes stay within half's exactly-representable integer range.
template <typename T>
__global__ void ReduceMean_kernel(int n,const T *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,T *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    T acc = 0.0;
    T cnt = 0;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += im[src];
                cnt++;
              }
    // Zero sum or zero count short-circuits to 0 (guards the 0/0 case and is
    // numerically identical when only the sum is zero).
    T zero_acc = 0.0;
    T zero_cnt = 0;
    output[out_idx] = (acc == zero_acc || cnt == zero_cnt) ? zero_acc : acc / cnt;
}
// L1-norm reduction (float): sum of absolute values. One thread per output
// element; an axis is reduced iff its output extent differs from the input
// extent.
// Fix: use fabsf explicitly instead of abs() — abs on a float risks resolving
// to the integer overload (silent truncation) depending on included headers.
__global__ void ReduceL1_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += fabsf(im[src]);
              }
    output[out_idx] = acc;
}
// L1-norm reduction (__half overload): sum of absolute values, accumulated in
// half precision exactly as stored. |x| is a sign branch (no half abs here).
__global__ void ReduceL1_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    __half acc = 0.0;
    const __half zero = 0.0;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += (im[src] > zero) ? im[src] : -im[src];
              }
    output[out_idx] = acc;
}
// L2-norm reduction (float): sqrt of the sum of squares. One thread per
// output element; an axis is reduced iff its output extent differs from the
// input extent.
__global__ void ReduceL2_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += im[src] * im[src];
              }
    output[out_idx] = sqrt(acc);
}
// L2-norm reduction (__half overload). The square is computed in half
// arithmetic (matching the stored precision) and accumulated in float;
// the final sqrt is taken in float and rounded back to half.
__global__ void ReduceL2_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                const float sq = im[src] * im[src];
                acc += sq;
              }
    output[out_idx] = __float2half(sqrt(acc));
}
// Sum-of-squares reduction (float). One thread per output element; an axis
// is reduced iff its output extent differs from the input extent.
__global__ void ReduceSumSquare_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += im[src] * im[src];
              }
    output[out_idx] = acc;
}
// Sum-of-squares reduction (__half overload). The square is computed in half
// arithmetic and accumulated in float; the result is rounded back to half.
__global__ void ReduceSumSquare_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                const float sq = im[src] * im[src];
                acc += sq;
              }
    output[out_idx] = __float2half(acc);
}
// Sum-of-logs reduction (float): sum(log(x)). One thread per output element;
// an axis is reduced iff its output extent differs from the input extent.
__global__ void ReduceLogSum_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += log(im[src]);
              }
    output[out_idx] = acc;
}
// Sum-of-logs reduction (__half overload): elements are widened to float,
// summed as log(x), and the float total is rounded back to half.
// Fix: the final store used __half2float(val) on a float value (an accidental
// float->half->float round trip relying on implicit conversions); the correct
// conversion for storing a float into the __half output is __float2half.
__global__ void ReduceLogSum_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                const float v = __half2float(im[src]);
                acc += log(v);
              }
    output[out_idx] = __float2half(acc);
}
// Log-sum-exp reduction (float): log(exp(x1)+exp(x2)+...). One thread per
// output element; an axis is reduced iff its output extent differs from the
// input extent. (Direct formulation — no max-subtraction stabilisation.)
__global__ void ReduceLogSumExp_kernel(int n,const float *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,float *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                acc += exp(im[src]);
              }
    output[out_idx] = log(acc);
}
// Log-sum-exp reduction (__half overload): elements are widened to float,
// exponentials are summed in float, and log of the total is rounded back to
// half. (Direct formulation — no max-subtraction stabilisation.)
__global__ void ReduceLogSumExp_kernel(int n,const __half *im,const int batch,const int channels,const int height,const int width,const int index4,const int index5,__half *output,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= n) return;
    // Decode the flat id into 6-D output coordinates, innermost first.
    int rem = tid;
    const int q5 = rem % output_index5; rem /= output_index5;
    const int q4 = rem % output_index4; rem /= output_index4;
    const int ow = rem % output_width;  rem /= output_width;
    const int oh = rem % output_height; rem /= output_height;
    const int oc = rem % output_channels;
    const int ob = rem / output_channels;
    const int out_idx = ((((ob*output_channels+oc)*output_height+oh)*output_width+ow)*output_index4+q4)*output_index5+q5;
    // A reduced axis iterates its full input extent; a kept axis takes one step.
    const int nb = (batch    == output_batch)    ? 1 : batch;
    const int nc = (channels == output_channels) ? 1 : channels;
    const int nh = (height   == output_height)   ? 1 : height;
    const int nw = (width    == output_width)    ? 1 : width;
    const int n4 = (index4   == output_index4)   ? 1 : index4;
    const int n5 = (index5   == output_index5)   ? 1 : index5;
    float acc = 0.0f;
    for (int bb = 0; bb < nb; ++bb)
      for (int cc = 0; cc < nc; ++cc)
        for (int hh = 0; hh < nh; ++hh)
          for (int ww = 0; ww < nw; ++ww)
            for (int p4 = 0; p4 < n4; ++p4)
              for (int p5 = 0; p5 < n5; ++p5) {
                const int src = (((((bb+ob)*channels+cc+oc)*height+hh+oh)*width+ww+ow)*index4+p4+q4)*index5+p5+q5;
                const float v = __half2float(im[src]);
                acc += exp(v);
              }
    output[out_idx] = __float2half(log(acc));
}
// Mapping between MIOpen reduce-op enum values and the ReduceType codes used
// by Reduce_Compute below:
// MIOPEN_REDUCE_TENSOR_ADD   = 0 | sum
// MIOPEN_REDUCE_TENSOR_MUL   = 1 | product
// MIOPEN_REDUCE_TENSOR_MIN   = 2 | minimum
// MIOPEN_REDUCE_TENSOR_MAX   = 3 | maximum
// MIOPEN_REDUCE_TENSOR_AMAX  = 4 | maximum absolute value
// MIOPEN_REDUCE_TENSOR_AVG   = 5 | mean
// MIOPEN_REDUCE_TENSOR_NORM1 = 6 | L1 norm
// MIOPEN_REDUCE_TENSOR_NORM2 = 7 | L2 norm
// See https://github.com/ROCmSoftwarePlatform/MIOpen/blob/83da5e99d67bd5b6d0c7c48924868c37394136c2/include/miopen/miopen.h#L507
// Host-side dispatcher: launches the reduce kernel selected by ReduceType
// (MIOpen numbering, see the table above) on `stream`.
// ReduceType 0 is further refined by the flags: calculate_log_ -> LogSum,
// calculate_sqt_ -> SumSquare, log_sum_exp_ -> LogSumExp, else plain Sum.
// Fix: the thread count previously covered only the first four output
// dimensions; the kernels decode SIX coordinates (including output_index4 and
// output_index5) from the flat id, so those extents must be part of the
// element count or trailing-dimension outputs are never written. When both
// extents are 1 the count is unchanged, so existing callers are unaffected.
template <typename T>
void Reduce_Compute(hipStream_t stream,const T *im, const int batch,const int channels,const int height,const int width,const int index4,const int index5,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5,
T *output,int ReduceType,const bool calculate_log_,const bool calculate_sqt_,const bool log_sum_exp_){
    // One thread per element of the (up to 6-D) output tensor.
    const int num_kernels = output_batch * output_channels * output_height *
                            output_width * output_index4 * output_index5;
    const int blocks = (num_kernels + BLOCK - 1) / BLOCK;
    switch (ReduceType) {
        case 0:  // ReduceSum family, refined by the boolean flags
            if (calculate_log_)
                ReduceLogSum_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            else if (calculate_sqt_)
                ReduceSumSquare_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            else if (log_sum_exp_)
                ReduceLogSumExp_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            else
                ReduceSum_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 1:  // ReduceProd
            ReduceProd_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 2:  // ReduceMin
            ReduceMin_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 3:  // ReduceMax
            ReduceMax_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 4:  // ReduceAMax (max absolute value)
            ReduceAMax_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 5:  // ReduceMean
            ReduceMean_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 6:  // ReduceL1
            ReduceL1_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        case 7:  // ReduceL2
            ReduceL2_kernel<<<blocks,BLOCK,0,stream>>>(num_kernels,im,batch,channels,height,width,index4,index5,output,output_batch,
                                    output_channels,output_height,output_width,output_index4,output_index5);
            break;
        default:
            break;  // unknown ReduceType: no-op (matches previous behaviour)
    }
}
// Explicit instantiations of the host-side dispatcher for the element types
// used by this provider. NOTE(review): `half` is presumably an alias of
// __half in this translation unit — confirm against the included headers.
#define INSTANTIATEREDUCE_COMPUTE(T) \
template void Reduce_Compute(hipStream_t stream,const T *im, const int batch,const int channels,const int height,const int width,const int index4,const int index5, \
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5, \
T *output,int ReduceType,const bool calculate_log_,const bool calculate_sqt_,const bool log_sum_exp_);
INSTANTIATEREDUCE_COMPUTE(float)
INSTANTIATEREDUCE_COMPUTE(half)
// NOTE(review): this appears to be the companion header (declaration of the
// dispatcher implemented above); it carries both a classic include guard and
// #pragma once — one of the two is redundant.
#ifndef REDUCTION_SUGON_H
#define REDUCTION_SUGON_H
#pragma once
// Launches the reduce kernel selected by ReduceType (MIOpen numbering) on
// `stream`; ReduceType 0 is refined by the three boolean flags.
template <typename T>
void Reduce_Compute(hipStream_t stream,const T *im, const int batch,const int channels,const int height,const int width,const int index4,const int index5,
const int output_batch,const int output_channels,const int output_height,const int output_width,const int output_index4,const int output_index5,
T *output,int ReduceType,const bool calculate_log_,const bool calculate_sqt_,const bool log_sum_exp_);
#endif
\ No newline at end of file
...@@ -83,6 +83,32 @@ inline rocblas_status rocblasGemmHelper(rocblas_handle handle, ...@@ -83,6 +83,32 @@ inline rocblas_status rocblasGemmHelper(rocblas_handle handle,
rocblas_gemm_algo_standard, 0, get_flag()); rocblas_gemm_algo_standard, 0, get_flag());
} }
// Mixed-precision GEMM helper: half-precision A/B with a float C/D buffer and
// an f32 compute type. alpha/beta arrive as host half values and are widened
// to float because rocblas_gemm_ex with compute type f32 expects f32 scalars.
// NOTE(review): the sibling overloads pass get_flag() as the final flags
// argument; this one hard-codes 0 — confirm whether the fp16-alt-impl flag
// should apply here as well.
inline rocblas_status rocblasGemmHelper(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m, int n, int k,
const half* alpha,
const half* A, int lda,
const half* B, int ldb,
const half* beta,
float* C, int ldc) {
// Widen the half scalars on the host (reinterpret as raw fp16 bits).
float h_a = onnxruntime::math::halfToFloat(*reinterpret_cast<const uint16_t*>(alpha));
float h_b = onnxruntime::math::halfToFloat(*reinterpret_cast<const uint16_t*>(beta));
return rocblas_gemm_ex(handle,
transa,
transb,
m, n, k,
&h_a,
A, rocblas_datatype_f16_r, lda,
B, rocblas_datatype_f16_r, ldb,
&h_b,
C, rocblas_datatype_f32_r, ldc,
C, rocblas_datatype_f32_r, ldc,
rocblas_datatype_f32_r,
rocblas_gemm_algo_standard, 0, 0);
}
inline rocblas_status rocblasGemmHelper(rocblas_handle handle, inline rocblas_status rocblasGemmHelper(rocblas_handle handle,
rocblas_operation transa, rocblas_operation transa,
rocblas_operation transb, rocblas_operation transb,
......
...@@ -7,6 +7,8 @@ import datetime ...@@ -7,6 +7,8 @@ import datetime
import platform import platform
import subprocess import subprocess
import sys import sys
import re
import os
from distutils import log as logger from distutils import log as logger
from distutils.command.build_ext import build_ext as _build_ext from distutils.command.build_ext import build_ext as _build_ext
from glob import glob, iglob from glob import glob, iglob
...@@ -17,12 +19,76 @@ from shutil import copyfile ...@@ -17,12 +19,76 @@ from shutil import copyfile
from packaging.tags import sys_tags from packaging.tags import sys_tags
from setuptools import Extension, setup from setuptools import Extension, setup
from setuptools.command.install import install as InstallCommandBase from setuptools.command.install import install as InstallCommandBase
from typing import Optional, Union
nightly_build = False nightly_build = False
package_name = "onnxruntime" package_name = "onnxruntime"
wheel_name_suffix = None wheel_name_suffix = None
def get_sha(ort_root: Union[str, Path]) -> str:
    """Return the HEAD commit hash of the git checkout at *ort_root*.

    Falls back to the literal string 'Unknown' when git is unavailable,
    the path is not a repository, or any other error occurs.
    """
    try:
        raw = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=ort_root)
        return raw.decode("ascii").strip()
    except Exception:
        return "Unknown"
def get_abi():
    """Return the local gcc C++ ABI tag, e.g. 'abi0' or 'abi1'.

    Probes gcc's predefined macros for _GLIBCXX_USE_CXX11_ABI and appends the
    macro's value to the prefix 'abi'. Returns 'abiUnknown' when gcc is
    missing or the macro cannot be found.
    """
    try:
        command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI"
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        output = result.stdout.strip()
        # Fix: an empty stdout (gcc absent / macro not defined) previously
        # produced the bogus value "abi"; report the explicit unknown marker.
        if not output:
            return 'abiUnknown'
        return "abi" + output.split(" ")[-1]
    except Exception:
        return 'abiUnknown'
def get_version_add(sha: Optional[str] = None) -> str:
    # Builds a local version suffix ("git<sha7>.<abi>.dtk<ver>") and rewrites
    # two files in place with "__dcu_version__ = '1.14.0+<suffix>'":
    #   - DCUORT_VERSION_NUMBER (line 0)
    #   - onnxruntime/__init__.py (line 11 — hardcoded index; brittle, breaks
    #     silently if that file's layout changes. TODO(review))
    # NOTE(review): despite the -> str annotation, nothing is returned; when
    # sha == 'Unknown' the suffix stays '' and the files are still rewritten.
    version=''
    ort_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(ort_root, "DCUORT_VERSION_NUMBER")
    if sha != 'Unknown':
        if sha is None:
            # Resolve the commit hash from the repo at the script's location.
            sha = get_sha(ort_root)
        version = 'git' + sha[:7]
        # abi
        version += "." + get_abi()
        # dtk version
        if os.getenv("ROCM_PATH"):
            rocm_path = os.getenv('ROCM_PATH', "")
            rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
            with open(rocm_version_path, 'r',encoding='utf-8') as file:
                lines = file.readlines()
                # Strips the trailing two characters (presumably newline plus
                # one more — TODO confirm file format) and drops the dots.
                rocm_version=lines[0][:-2].replace(".", "")
            version += ".dtk" + rocm_version
    lines=[]
    with open(add_version_path, 'r',encoding='utf-8') as file:
        lines = file.readlines()
    # Replace the first line of DCUORT_VERSION_NUMBER with the new version.
    lines[0] = "__dcu_version__ = '1.14.0+{}'\n".format(version)
    with open(add_version_path, encoding="utf-8",mode="w") as file:
        file.writelines(lines)
        file.close()
    init_path=os.path.join(ort_root, "onnxruntime/__init__.py")
    with open(init_path, 'r',encoding='utf-8') as file:
        lines = file.readlines()
    # Replace the twelfth line of onnxruntime/__init__.py (hardcoded index).
    lines[11] = "__dcu_version__ = '1.14.0+{}'\n".format(version)
    with open(init_path, encoding="utf-8",mode="w") as file:
        file.writelines(lines)
        file.close()
def get_version():
    """Refresh DCUORT_VERSION_NUMBER via get_version_add(), then execute that
    file and return the ``__dcu_version__`` value it defines."""
    get_version_add()
    root = os.path.dirname(os.path.abspath(__file__))
    version_file = os.path.join(root, "DCUORT_VERSION_NUMBER")
    with open(version_file, encoding='utf-8') as f:
        source = f.read()
    exec(compile(source, version_file, 'exec'))
    return locals()['__dcu_version__']
def parse_arg_remove_boolean(argv, arg_name): def parse_arg_remove_boolean(argv, arg_name):
arg_value = False arg_value = False
if arg_name in sys.argv: if arg_name in sys.argv:
...@@ -546,7 +612,7 @@ package_data["onnxruntime"] = data + examples + extra ...@@ -546,7 +612,7 @@ package_data["onnxruntime"] = data + examples + extra
version_number = "" version_number = ""
with open("VERSION_NUMBER") as f: with open("VERSION_NUMBER") as f:
version_number = f.readline().strip() version_number = f.readline().strip()
sub_version="" sub_version="_light"
dtk_version ="" dtk_version =""
with open("/opt/dtk/.info/rocm_version") as f: with open("/opt/dtk/.info/rocm_version") as f:
dtk_version = f.readline().strip() dtk_version = f.readline().strip()
...@@ -675,8 +741,8 @@ if enable_training: ...@@ -675,8 +741,8 @@ if enable_training:
# Setup # Setup
setup( setup(
name="onnxruntime",#name=package_name name="onnxruntime-lite",#name=package_name
version=version_number+"+dtk"+dtk_version ,# version=get_version() ,#
description="ONNX Runtime is a runtime accelerator for Machine Learning models", description="ONNX Runtime is a runtime accelerator for Machine Learning models",
long_description=long_description, long_description=long_description,
author="Microsoft Corporation", author="Microsoft Corporation",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment