gaoqiong / onnxruntime_v14 / Commits / 25d7fde8

Commit 25d7fde8, authored Jul 25, 2023 by gaoqiong

lite

parent 8439d29f
Changes: 312

Showing 20 changed files with 463 additions and 11 deletions (+463 -11)
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where_impl.h  +0 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/all_tests.h  +0 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/beam_search_topk.cc  +0 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/rocm_execution_provider_test.cc  +0 -0
onnxruntime/__init__.py  +1 -0
onnxruntime/core/providers/cpu/nn/batch_norm.h  +1 -1
onnxruntime/core/providers/cpu/nn/conv.cc  +9 -0
onnxruntime/core/providers/cpu/nn/conv_transpose.cc  +1 -0
onnxruntime/core/providers/cpu/nn/pool_attributes.h  +14 -0
onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh  +2 -2
onnxruntime/core/providers/cuda/cu_inc/elementwise_impl.cuh  +1 -1
onnxruntime/core/providers/cuda/tensor/concat_impl.cu  +1 -1
onnxruntime/core/providers/cuda/tensor/slice_impl.cu  +1 -1
onnxruntime/core/providers/cuda/tensor/split_impl.cu  +1 -1
onnxruntime/core/providers/cuda/tensor/tile_impl.cu  +1 -1
onnxruntime/core/providers/rocm/nn/bn_sugon.cu  +94 -0
onnxruntime/core/providers/rocm/nn/bn_sugon.cuh  +9 -0
onnxruntime/core/providers/rocm/nn/conv.cc  +197 -0
onnxruntime/core/providers/rocm/nn/conv.h  +1 -0
onnxruntime/core/providers/rocm/nn/conv_transpose.cc  +129 -3
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where_impl.h
File mode changed from 100644 to 100755

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/all_tests.h
File mode changed from 100644 to 100755

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/beam_search_topk.cc
File mode changed from 100644 to 100755

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/rocm_execution_provider_test.cc
File mode changed from 100644 to 100755
onnxruntime/__init__.py

...
@@ -9,6 +9,7 @@ or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
 __version__ = "1.14.0"
 __author__ = "Microsoft"
+__dcu_version__ = "1.14.0+git556e6af.abi0.dtk2304"
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
 # in order to know whether the onnxruntime package is for training it needs
...
onnxruntime/core/providers/cpu/nn/batch_norm.h

...
@@ -191,7 +191,7 @@ class BatchNorm : public OpKernel {
    EigenArrayMap<T> Y_arr(Y->MutableData<T>(),
                           is_spatial_ ? sample_size : sample_size_incl_all_channels,
                           is_spatial_ ? N * C : N);

    if (is_spatial_) {  // spatial == 1
      for (size_t nc = 0; nc < N * C; ++nc) {
        Y_arr.col(nc) = X_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
...
onnxruntime/core/providers/cpu/nn/conv.cc

...
@@ -73,6 +73,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
  const size_t kernel_rank = kernel_shape.size();

  BufferUniquePtr col_buffer;
+ //printf("***********<typename T>***********");

  // Pointwise convolutions can use the original input tensor in place,
  // otherwise a temporary buffer is required for the im2col transform.
...
@@ -126,6 +127,13 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
        }
      }
+     // std::cout<<"col_buffer_data:"<<std::endl;
+     // for(int i=0;i<200;++i)
+     // {
+     //   printf("%f ",col_buffer_data[i]);
+     // }
+     // printf("\n");
      math::Gemm<T>(
          CblasNoTrans,
          CblasNoTrans,
...
@@ -163,6 +171,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
  const int64_t C = X->Shape()[1];
  const int64_t M = W->Shape()[0];
  ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
+ //printf("##############float###############");

  // kernel_shape is an optional attribute and has to be inferred from W if not provided
  TensorShapeVector kernel_shape;
...
onnxruntime/core/providers/cpu/nn/conv_transpose.cc

...
@@ -21,6 +21,7 @@
 #include "core/common/safeint.h"
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
+#include <iostream>

 namespace onnxruntime {
...
onnxruntime/core/providers/cpu/nn/pool_attributes.h

...
@@ -28,6 +28,14 @@ struct PoolAttributes {
                 const std::string& op_name, int start_version)
      : global_pooling(IsGlobalPooling(op_name)) {
    if (global_pooling) {
+     if (op_name == "GlobalAveragePool") {
+       global_average_pooling = true;
+     }
+     if (op_name == "GlobalMaxPool") {
+       global_max_pooling = true;
+     }
      return;
    }
...
@@ -62,11 +70,13 @@ struct PoolAttributes {
    if (op_name == "AveragePool") {
      int64_t temp;
+     average_pooling = true;
      ORT_ENFORCE(info.GetAttr<int64_t>("count_include_pad", &temp).IsOK());
      count_include_pad = (temp != 0);
    }

    if (op_name == "MaxPool") {
+     max_pooling = true;
      if (start_version >= 8) {
        ORT_ENFORCE(info.GetAttr("storage_order", &storage_order).IsOK());
      }
...
@@ -84,6 +94,10 @@ struct PoolAttributes {
  }

  const bool global_pooling;
+ bool max_pooling = false;
+ bool global_max_pooling = false;
+ bool average_pooling = false;
+ bool global_average_pooling = false;

  bool count_include_pad{};
  int64_t storage_order{0};  // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
...
onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh

...
@@ -191,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl(
 #ifdef USE_ROCM
  const int num_elements_per_thread = 2;
- const int num_threads_per_block = 512;
+ const int num_threads_per_block = 256;
 #else
  const int num_elements_per_thread = GridDim::maxElementsPerThread;
  const int num_threads_per_block = GridDim::maxThreadsPerBlock;
...
@@ -227,7 +227,7 @@ void BinaryElementWiseImpl(
 #ifdef USE_ROCM
  const int num_elements_per_thread = 2;
- const int num_threads_per_block = 512;
+ const int num_threads_per_block = 256;
 #else
  const int num_elements_per_thread = GridDim::maxElementsPerThread;
  const int num_threads_per_block = GridDim::maxThreadsPerBlock;
...
onnxruntime/core/providers/cuda/cu_inc/elementwise_impl.cuh

...
@@ -10,7 +10,7 @@ namespace cuda {
 #ifdef USE_ROCM
 constexpr int kElementsPerThread = 2;
-constexpr int kThreadsPerBlock = 512;
+constexpr int kThreadsPerBlock = 256;
 #else
 constexpr int kElementsPerThread = GridDim::maxElementsPerThread;
 constexpr int kThreadsPerBlock = GridDim::maxThreadsPerBlock;
...
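Note on the USE_ROCM hunks in this commit: they all make the same tuning change, keeping 2 elements per thread but dropping the block size from 512 to 256 threads (the same 512 to 256 edit appears in binary_elementwise_impl.cuh above and in concat_impl.cu, slice_impl.cu, split_impl.cu and tile_impl.cu below). The sketch below is not ORT code; the AddOne kernel and its launch math are hypothetical, and it only illustrates how a per-thread element count and a block size of this kind typically determine the number of blocks launched, so halving the block size doubles the block count for the same tensor.

// Hypothetical illustration (not part of the commit): how constants like
// kElementsPerThread / kThreadsPerBlock typically translate into a launch configuration.
#include <hip/hip_runtime.h>
#include <cstdio>

constexpr int kElementsPerThread = 2;
constexpr int kThreadsPerBlock = 256;  // the value this commit selects for ROCm (was 512)

__global__ void AddOne(float* data, int count) {
  // Each thread handles kElementsPerThread consecutive elements.
  int base = (blockIdx.x * blockDim.x + threadIdx.x) * kElementsPerThread;
  for (int i = 0; i < kElementsPerThread; ++i) {
    int idx = base + i;
    if (idx < count) data[idx] += 1.0f;
  }
}

int main() {
  const int count = 1 << 20;
  float* d = nullptr;
  hipMalloc(&d, count * sizeof(float));
  hipMemset(d, 0, count * sizeof(float));
  // One block covers kThreadsPerBlock * kElementsPerThread elements.
  const int elements_per_block = kThreadsPerBlock * kElementsPerThread;
  const int blocks = (count + elements_per_block - 1) / elements_per_block;
  AddOne<<<blocks, kThreadsPerBlock>>>(d, count);
  hipDeviceSynchronize();
  hipFree(d);
  printf("launched %d blocks of %d threads\n", blocks, kThreadsPerBlock);
  return 0;
}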
onnxruntime/core/providers/cuda/tensor/concat_impl.cu

...
@@ -12,7 +12,7 @@ namespace cuda {
 namespace {
 #ifdef USE_ROCM
 constexpr int kNumElementsPerThread = 2;
-constexpr int kNumThreadsPerBlock = 512;
+constexpr int kNumThreadsPerBlock = 256;
 #else
 constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
 constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
...
onnxruntime/core/providers/cuda/tensor/slice_impl.cu

...
@@ -11,7 +11,7 @@ namespace cuda {
 namespace {
 #ifdef USE_ROCM
 constexpr int kNumElementsPerThread = 2;
-constexpr int kNumThreadsPerBlock = 512;
+constexpr int kNumThreadsPerBlock = 256;
 #else
 constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
 constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
...
onnxruntime/core/providers/cuda/tensor/split_impl.cu

...
@@ -12,7 +12,7 @@ namespace cuda {
 namespace {
 #ifdef USE_ROCM
 constexpr int kNumElementsPerThread = 2;
-constexpr int kNumThreadsPerBlock = 512;
+constexpr int kNumThreadsPerBlock = 256;
 #else
 constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
 constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
...
onnxruntime/core/providers/cuda/tensor/tile_impl.cu

...
@@ -9,7 +9,7 @@ namespace cuda {
 #ifdef USE_ROCM
 constexpr int num_elements_per_thread = 2;
-constexpr int num_threads_per_block = 512;
+constexpr int num_threads_per_block = 256;
 #else
 constexpr int num_elements_per_thread = GridDim::maxElementsPerThread;
 constexpr int num_threads_per_block = GridDim::maxThreadsPerBlock;
...
onnxruntime/core/providers/rocm/nn/bn_sugon.cu  0 → 100755 (new file)

#include <hiprand.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include "ort_sugon.cuh"
#include <math.h>
#include "bn_sugon.cuh"

__global__ void batch_normal_kernel(int n, const float *im, const float *scale, const float *bias,
                                    const float *mean, const float *var, float *output,
                                    const int batch, const int channels, const int height, const int width,
                                    const int index4, const int index5) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  int j_index5 = id % index5;
  id /= index5;
  int j_index4 = id % index4;
  id /= index4;
  int j = id % width;  // column
  id /= width;
  int i = id % height;
  id /= height;
  int k = id % channels;
  id /= channels;
  int b = id;
  float epsilon = 0.00001;
  int input_index = j_index5 + index5 * (j_index4 + index4 * (j + width * (i + height * (k + b * channels))));
  // hsqrt provides half-precision square root; __float2half / __half2float convert between float and half.
  output[input_index] = (im[input_index] - mean[k]) / sqrt(var[k] + epsilon) * scale[k] + bias[k];
}

__global__ void batch_normal_kernel(int n, const __half *im, const __half *scale, const __half *bias,
                                    const __half *mean, const __half *var, __half *output,
                                    const int batch, const int channels, const int height, const int width,
                                    const int index4, const int index5) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id >= n) return;
  int j_index5 = id % index5;
  id /= index5;
  int j_index4 = id % index4;
  id /= index4;
  int j = id % width;  // column
  id /= width;
  int i = id % height;
  id /= height;
  int k = id % channels;
  id /= channels;
  int b = id;
  // We can fuse the output computation as follows:
  //   ((x - est_mean) * (inv_var)) * scale + bias
  // to
  //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
  int input_index = j_index5 + index5 * (j_index4 + index4 * (j + width * (i + height * (k + b * channels))));
  // hsqrt provides half-precision square root; __float2half / __half2float convert between float and half.
  const float val1 = var[k];
  const float scale1 = scale[k];
  const float mean1 = mean[k];
  const float input = im[input_index];
  const float bias1 = bias[k];
  const float epsilon = 0.00001;
  const float new_scale = scale1 / sqrt(val1 + epsilon);
  const float new_bias = bias1 - mean1 * new_scale;
  const float tmp = input * new_scale + new_bias;
  //output[input_index]=(im[input_index]-mean[k])/hsqrt(var[k]+epsilon) *scale[k]+bias[k];
  output[input_index] = __float2half(tmp);
}

template <typename T>
void batch_normal(hipStream_t stream, const T *im, const T *scale, const T *bias, const T *mean, const T *var,
                  T *output, const int batch, const int channels, const int height, const int width,
                  const int index4, const int index5) {
  int num_kernels = channels * batch * height * width * index4 * index5;
  batch_normal_kernel<<<(num_kernels + BLOCK - 1) / BLOCK, BLOCK, 0, stream>>>(
      num_kernels, im, scale, bias, mean, var, output, batch, channels, height, width, index4, index5);
}

#define INSTANTIATEBATCH_NORMAL(T) \
  template void batch_normal(hipStream_t stream, const T *im, const T *scale, const T *bias, const T *mean, \
                             const T *var, T *output, const int batch, \
                             const int channels, const int height, const int width, const int index4, const int index5);

INSTANTIATEBATCH_NORMAL(float)
INSTANTIATEBATCH_NORMAL(half)
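Written out per channel $k$ (with scale $\gamma_k$, bias $\beta_k$, running mean $\mu_k$, variance $\sigma_k^2$ and the kernel's $\varepsilon = 10^{-5}$), the fusion described by the comment in the half-precision kernel is

$$ y \;=\; \frac{x-\mu_k}{\sqrt{\sigma_k^2+\varepsilon}}\,\gamma_k + \beta_k \;=\; x\cdot\underbrace{\frac{\gamma_k}{\sqrt{\sigma_k^2+\varepsilon}}}_{\texttt{new\_scale}} \;+\; \underbrace{\Big(\beta_k - \mu_k\,\frac{\gamma_k}{\sqrt{\sigma_k^2+\varepsilon}}\Big)}_{\texttt{new\_bias}}, $$

which is exactly the new_scale / new_bias pair the kernel computes in float before converting the result back to half with __float2half.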
onnxruntime/core/providers/rocm/nn/bn_sugon.cuh  0 → 100755 (new file)

#ifndef BN_SUGON_H
#define BN_SUGON_H
#pragma once

template <typename T>
void batch_normal(hipStream_t stream, const T *im, const T *scale, const T *bias, const T *mean, const T *var,
                  T *output, const int batch, const int channels, const int height, const int width,
                  const int index4, const int index5);
#endif
\ No newline at end of file
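A hypothetical usage sketch (not part of the commit): for a plain 4-D NCHW tensor the two trailing dimensions index4 and index5 can be passed as 1, in which case the kernel's flattened index reduces to j + width*(i + height*(k + b*channels)), i.e. ordinary NCHW addressing. The helper name run_bn_nchw is invented for illustration.

// Hypothetical helper (not in the commit) showing one way batch_normal might be called
// for an NCHW float tensor; index4 and index5 are set to 1 so the 6-level index
// decomposition in bn_sugon.cu collapses to standard NCHW addressing.
#include <hip/hip_runtime.h>
#include "bn_sugon.cuh"

void run_bn_nchw(hipStream_t stream,
                 const float* x, const float* scale, const float* bias,
                 const float* mean, const float* var, float* y,
                 int N, int C, int H, int W) {
  // One thread per element; the kernel recovers (b, k, i, j) from the linear thread id.
  batch_normal<float>(stream, x, scale, bias, mean, var, y,
                      N, C, H, W, /*index4=*/1, /*index5=*/1);
}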
onnxruntime/core/providers/rocm/nn/conv.cc

...
@@ -4,9 +4,21 @@
#include "core/providers/rocm/nn/conv.h"
#include "core/common/span_utils.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/math/gemm.h"
#include "core/providers/cpu/math/gemm_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/tunable/gemm.h"
#include "core/providers/rocm/tensor/slice.h"
#include "core/providers/rocm/nn/im2col.cuh"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <iostream>

using namespace std;

#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"

namespace onnxruntime {
namespace rocm {
...
@@ -87,6 +99,173 @@ Status SliceOutUnwantedOutputSection(hipStream_t stream,
  return SliceRocm::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size);
}

template <typename T>
Status Conv<T>::conv_im2col(OpKernelContext* context, bool bias_expected) const {
  //set X
  const Tensor* X = context->Input<Tensor>(0);
  const Tensor* W = context->Input<Tensor>(1);
  const Tensor* B = context->InputCount() >= 3 ? context->Input<Tensor>(2) : nullptr;
  const Tensor* Sum = context->InputCount() >= 4 ? context->Input<Tensor>(3) : nullptr;
  const int64_t N = X->Shape()[0];
  const int64_t C = X->Shape()[1];
  const int64_t M = W->Shape()[0];
  ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));

  // kernel_shape is an optional attribute and has to be inferred from W if not provided
  TensorShapeVector kernel_shape;
  ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));

  ConvAttributes::ConvPadVector pads(conv_attrs_.pads);
  if (pads.empty()) {
    pads.resize(kernel_shape.size() * 2, 0);
  }
  TensorShapeVector dilations(conv_attrs_.dilations);
  if (dilations.empty()) {
    dilations.resize(kernel_shape.size(), 1);
  }
  TensorShapeVector strides(conv_attrs_.strides);
  if (strides.empty()) {
    strides.resize(kernel_shape.size(), 1);
  }

  TensorShapeVector Y_dims({N, M});
  TensorShape input_shape = X->Shape().Slice(2);
  ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
  Tensor* Y = context->Output(0, TensorShape(Y_dims));
  TensorShape output_shape = Y->Shape().Slice(2);

  // Bail out early if one of the dimensions is zero.
  if (Y->Shape().Size() == 0) {
    return Status::OK();
  }

  AllocatorPtr alloc;
  ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));

  const auto* Xdata = reinterpret_cast<const HipT*>(X->Data<T>());
  //const auto* Xdata = X->Data<float>();
  //const auto* Bdata = B != nullptr ? B->Data<float>() : nullptr;
  const auto* Wdata = reinterpret_cast<const HipT*>(W->Data<T>());
  const auto* Bdata = (B != nullptr) ? reinterpret_cast<const HipT*>(B->Data<T>()) : nullptr;
  //auto* Ydata = Y->MutableData<float>();
  auto* Ydata = reinterpret_cast<HipT*>(Y->MutableData<T>());

  // Check for the optional Conv/Sum fusion.
  //float Beta = 0.0f;
  if (Sum != nullptr) {
    const auto& sum_shape = Sum->Shape();
    ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match");
    // If the output was not allocated inplace with the sum tensor, then copy here.
    const auto* sum_data = reinterpret_cast<const HipT*>(Sum->Data<T>());
    if (Ydata != sum_data) {
      hipMemcpy(Ydata, sum_data, SafeInt<size_t>(sum_shape.Size()) * sizeof(HipT), hipMemcpyDeviceToDevice);
    }
    // Beta = 1.0f;
  }

  const size_t kernel_rank = kernel_shape.size();
  const int64_t input_image_size = input_shape.Size();
  const int64_t output_image_size = output_shape.Size();
  const int64_t kernel_size = TensorShape(kernel_shape).Size();
  const int64_t X_offset = C / conv_attrs_.group * input_image_size;
  const int64_t Y_offset = Y->Shape().Size() / Y->Shape()[0] / conv_attrs_.group;
  const int64_t W_offset = W->Shape().Size() / conv_attrs_.group;
  const int64_t kernel_dim = C / conv_attrs_.group * kernel_size;
  const int64_t single_col_buffer_size = kernel_dim * output_image_size;
  // Temporary buffer needed for the im2col expansion.
  const int64_t col_buffer_size = kernel_dim * conv_attrs_.group * output_image_size;
  const int64_t im2col_X_offset = C * input_image_size;

  auto* col_data = alloc->Alloc(sizeof(HipT) * SafeInt<size_t>(col_buffer_size));
  BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc)));
  auto* col_buffer_data = reinterpret_cast<HipT*>(col_buffer.get());

  //static_cast
  const HipT zero = ToHipType<T>::FromFloat(0.f);
  const float alpha = 1.0f;
  //const float beta = 0.0f;

  if (kernel_rank == 2 || kernel_rank == 1) {
    if (Bdata != nullptr) {
      assign_bias_gpu<HipT>(Stream(), Ydata, Bdata, N, M, output_image_size);
    } else {
      //hipMemset(Ydata, zero, output_image_size);  // initialize Ydata to zero
      assign_val_gpu<HipT>(Stream(), Ydata, zero, N, M, output_image_size);
    }
    for (int image_id = 0; image_id < N; ++image_id) {
      auto* temp_b = col_buffer_data;
      auto* im_src = reinterpret_cast<const HipT*>(Xdata + (image_id)*im2col_X_offset);  //X
      if (kernel_rank == 2)
        im2col_gpu<HipT>(Stream(), im_src, C, input_shape[0], input_shape[1],
                         kernel_shape[0], kernel_shape[1], strides[0], strides[1],
                         pads[0], pads[1], pads[2], pads[3],
                         dilations[0], dilations[1], temp_b);
      else if (kernel_rank == 1)
        im2col_gpu<HipT>(Stream(), im_src, C, 1, input_shape[0], 1, kernel_shape[0],
                         1, strides[0], 0, pads[0], 0, pads[1],
                         1, dilations[0], temp_b);  // the last 0 is the padding value

      auto* a = Wdata;  //W
      auto* b = col_buffer_data;
      auto* c = Ydata + (image_id * conv_attrs_.group) * Y_offset;
      const int stride_A = M / conv_attrs_.group * kernel_dim;
      const int stride_B = output_image_size * kernel_dim;
      const int stride_C = M / conv_attrs_.group * output_image_size;

      ROCBLAS_RETURN_IF_ERROR(rocblasGemmStridedBatchedHelper(
          RocblasHandle(),
          rocblas_operation_none,
          rocblas_operation_none,
          static_cast<int>(output_image_size), static_cast<int>(M / conv_attrs_.group), static_cast<int>(kernel_dim),
          &alpha,
          b, static_cast<int>(output_image_size), stride_B,  //x
          a, static_cast<int>(kernel_dim), stride_A,  //w
          &alpha,
          c, static_cast<int>(output_image_size), stride_C,
          static_cast<int>(conv_attrs_.group)));

      // for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
      //   auto *a = W->Data<float>() + (group_id) * W_offset;  //W
      //   auto *b = col_buffer_data+(group_id)*single_col_buffer_size;
      //   //auto *im = Xdata + (image_id*conv_attrs_.group+group_id)*X_offset;  //X
      //   auto *c = Ydata + (image_id*conv_attrs_.group+group_id)*Y_offset;
      //   const float alpha = 1.0;
      //   const float beta = 0.0;
      //   ROCBLAS_RETURN_IF_ERROR(rocblasGemmHelper(
      //       RocblasHandle(),
      //       rocblas_operation_none,
      //       rocblas_operation_none,
      //       output_image_size, M/conv_attrs_.group, kernel_dim,
      //       &alpha,
      //       b, output_image_size,
      //       a, kernel_dim,
      //       &beta,
      //       c, output_image_size));
      // }
    }
    // if (Bdata!=nullptr)
    // {
    //   //void add_bias_gpu(hipStream_t stream,T *output,const T *biases,const int batch,const int c_out,const int out_putsize)
    //   add_bias_gpu<HipT>(Stream(),Ydata, Bdata, static_cast<int>(N), static_cast<int>(M), static_cast<int>(output_image_size));
    // }
  }
  return Status::OK();
}

template <typename T>
Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const {
  //set X
...
@@ -260,6 +439,7 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
    }
    const auto& perf = s_.cached_benchmark_fwd_results.at(x_dims_miopen);
    s_.fwd_algo = perf.fwd_algo;
    s_.workspace_bytes = perf.memory;
  } else {
    //set Y
...
@@ -280,7 +460,24 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
template <typename T>
Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
  std::lock_guard<OrtMutex> lock(s_.mutex);
  // Check whether this is a 2-D convolution; if it is, compute it with im2col + GEMM.
  const Tensor* W = context->Input<Tensor>(1);
  TensorShapeVector kernel_shape;
  ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
  const size_t kernel_rank = kernel_shape.size();
  if (kernel_rank == 2 || kernel_rank == 1) {
    //std::cout<<"conv compute with im2col+gemm"<<std::endl;
    ORT_RETURN_IF_ERROR(conv_im2col(context));
    return Status::OK();
  }
  // Otherwise compute the convolution through MIOpen.
  //std::cout<<"conv compute with miopen"<<std::endl;
  ORT_RETURN_IF_ERROR(UpdateState(context));
  if (s_.Y->Shape().Size() == 0) {
    return Status::OK();
  }
...
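The conv_im2col path above is ordinary im2col + GEMM: per image, im2col expands the input into a [kernel_dim, output_image_size] matrix per group (kernel_dim = C/G * kH * kW), and the strided-batched GEMM multiplies each group's [M/G, kernel_dim] filter block into it, accumulating onto the bias-initialized output (beta is passed as 1.0 via &alpha). The snippet below is only an illustration of that size bookkeeping with invented example dimensions, not ORT code; conv_out_dim is the standard convolution output-size formula.

// Illustrative size bookkeeping for the im2col + GEMM path (a sketch with made-up
// dimensions, not ORT code).
#include <cstdint>
#include <cstdio>

// Standard convolution output-size formula.
int64_t conv_out_dim(int64_t in, int64_t k, int64_t stride,
                     int64_t pad_begin, int64_t pad_end, int64_t dilation) {
  return (in + pad_begin + pad_end - dilation * (k - 1) - 1) / stride + 1;
}

int main() {
  // Example: C=64 input channels, 56x56 image, 3x3 kernel, stride 1, pad 1, dilation 1, group G=1.
  const int64_t C = 64, H = 56, W = 56, kH = 3, kW = 3, G = 1;
  const int64_t outH = conv_out_dim(H, kH, 1, 1, 1, 1);
  const int64_t outW = conv_out_dim(W, kW, 1, 1, 1, 1);
  const int64_t output_image_size = outH * outW;                        // columns of the im2col matrix
  const int64_t kernel_dim = C / G * kH * kW;                           // GEMM inner dimension
  const int64_t col_buffer_elems = kernel_dim * G * output_image_size;  // matches col_buffer_size above
  printf("output %lld x %lld, kernel_dim %lld, col buffer %lld elements\n",
         (long long)outH, (long long)outW, (long long)kernel_dim, (long long)col_buffer_elems);
  return 0;
}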
onnxruntime/core/providers/rocm/nn/conv.h

...
@@ -191,6 +191,7 @@ class Conv : public RocmKernel {
  }

  Status UpdateState(OpKernelContext* context, bool bias_expected = false) const;
+ Status conv_im2col(OpKernelContext* context, bool bias_expected = false) const;

  ConvAttributes conv_attrs_;
  mutable MiopenConvState<miopenConvAlgoPerf_t> s_;
  constexpr static auto kDefaultConvAlgo = miopenConvolutionFwdAlgoGEMM;
...
onnxruntime/core/providers/rocm/nn/conv_transpose.cc

...
@@ -2,6 +2,13 @@
// Licensed under the MIT License.

#include "conv_transpose.h"
#include "core/providers/rocm/nn/im2col.cuh"
#include "core/providers/rocm/nn/ort_sugon.cuh"
#include <iostream>

#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wunused-variable"

namespace onnxruntime {
namespace rocm {
...
@@ -36,6 +43,116 @@ Status ConvTranspose<T>::ComputeInternal(OpKernelContext* context) const {
  return DoConvTranspose(context, false);
}

template <typename T>
Status ConvTranspose<T>::ConvTranspose_col2im(OpKernelContext* context, bool dynamic_padding) const {
  typedef typename ToHipType<T>::MappedType HipT;

  size_t num_inputs = OpKernel::Node().InputDefs().size();
  ConvTransposeAttributes::Prepare p;
  bool has_bias = dynamic_padding ? num_inputs == 4 : num_inputs == 3;
  ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding,
                                                              transposed_filter_ ? &filter_shape_ : nullptr));

  // Bail out early if one of the dimensions is zero.
  if (p.Y->Shape().Size() == 0) {
    return Status::OK();
  }

  const int64_t input_image_size = p.input_shape.Size();
  const int64_t X_offset = p.num_input_channels / conv_transpose_attrs_.group * input_image_size;
  const int64_t Y_offset = p.Y->Shape().Size() / p.Y->Shape()[0] / conv_transpose_attrs_.group;
  const int64_t W_offset = p.F->Shape().Size() / conv_transpose_attrs_.group;
  const int64_t kernel_size = TensorShape(p.kernel_shape).Size();
  const int64_t kernel_dim = p.num_output_channels / conv_transpose_attrs_.group * kernel_size;
  const int64_t output_size = (p.Y->Shape().Slice(2)).Size();

  AllocatorPtr alloc;
  ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
  const int64_t col_buffer_size = kernel_dim * p.input_shape.Size();  //*conv_transpose_attrs_.group ;
  auto col_data = alloc->Alloc(SafeInt<size_t>(sizeof(float)) * col_buffer_size);
  BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc)));
  auto* col_buffer_data = reinterpret_cast<float*>(col_buffer.get());

  const auto* Xdata = reinterpret_cast<const HipT*>(p.X->Data<T>());
  const auto* filter_data = p.F ? reinterpret_cast<const HipT*>(p.F->Data<T>())
                                : reinterpret_cast<HipT*>(transposed_filter_.get());
  auto* Ydata = reinterpret_cast<HipT*>(p.Y->MutableData<T>());
  TensorShape output_shape = p.Y->Shape().Slice(2);

  const HipT zero = ToHipType<T>::FromFloat(0.f);
  const HipT one = ToHipType<T>::FromFloat(1.f);
  const float alpha = 1.0f;
  const float beta = 0.0f;

  if (has_bias) {
    const auto* Bdata = reinterpret_cast<const HipT*>(p.B->Data<T>());
    const auto& b_shape = p.B->Shape();
    ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D");
    assign_bias_gpu<HipT>(Stream(), Ydata, Bdata, p.N, p.num_output_channels, output_size);
  } else {
    assign_val_gpu<HipT>(Stream(), Ydata, zero, p.N, p.num_output_channels, output_size);
  }

  for (auto image_id = 0; image_id < p.N; ++image_id) {
    // auto* a= filter_data ;  //[C_in/G,C_out/G,k_h,k_w] ---> flattened to [C_in/G, C_out/G*k_h*k_w]
    // auto* b= Xdata + (image_id*conv_transpose_attrs_.group) * X_offset;  //[1,C_in/G,H,W] ---> flattened to [C_in/G, H*W]
    // auto* y= Ydata + (image_id*conv_transpose_attrs_.group) * Y_offset;
    // int stride_A = p.num_input_channels / conv_transpose_attrs_.group*kernel_dim;
    // int stride_B = input_image_size*kernel_dim;
    // int stride_C = p.num_input_channels / conv_transpose_attrs_.group*input_image_size;
    // ROCBLAS_RETURN_IF_ERROR(rocblasGemmStridedBatchedHelper(
    //     RocblasHandle(),
    //     rocblas_operation_none,
    //     p.F ? rocblas_operation_transpose:rocblas_operation_none,  //rocblas_operation_transpose
    //     static_cast<int>(input_image_size),static_cast<int>(kernel_dim),static_cast<int>(p.num_input_channels / conv_transpose_attrs_.group),
    //     &alpha,
    //     b, static_cast<int>(input_image_size),stride_B,  //x
    //     a, static_cast<int>(kernel_dim),stride_A,  //w
    //     &beta,
    //     col_buffer_data, static_cast<int>(input_image_size),stride_C,conv_transpose_attrs_.group));

    for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
      //gemm
      auto* a = filter_data + group_id * W_offset;  //[C_in/G,C_out/G,k_h,k_w] ---> flattened to [C_in/G, C_out/G*k_h*k_w]
      auto* b = Xdata + (image_id * conv_transpose_attrs_.group + group_id) * X_offset;  //[1,C_in/G,H,W] ---> flattened to [C_in/G, H*W]
      auto* y = Ydata + (image_id * conv_transpose_attrs_.group + group_id) * Y_offset;
      ROCBLAS_RETURN_IF_ERROR(rocblasGemmHelper(
          RocblasHandle(),
          rocblas_operation_none,
          p.F ? rocblas_operation_transpose : rocblas_operation_none,  //rocblas_operation_transpose
          static_cast<int>(input_image_size), static_cast<int>(kernel_dim), static_cast<int>(p.num_input_channels / conv_transpose_attrs_.group),
          &one,
          b, static_cast<int>(input_image_size),  //x
          a, static_cast<int>(kernel_dim),  //w
          &zero,
          col_buffer_data, static_cast<int>(input_image_size)));
      if (p.X->Shape().NumDimensions() == 4) {
        col2im_gpu<HipT>(Stream(), col_buffer_data,
                         p.num_output_channels / conv_transpose_attrs_.group,
                         p.Y->Shape()[2], p.Y->Shape()[3],
                         p.kernel_shape[0], p.kernel_shape[1],
                         p.strides[0], p.strides[1],
                         p.pads[0], p.pads[1], p.pads[2], p.pads[3],
                         p.dilations[0], p.dilations[1], y);
      } else if (p.X->Shape().NumDimensions() == 3) {
        col2im_gpu<HipT>(Stream(), col_buffer_data,
                         p.num_output_channels / conv_transpose_attrs_.group,
                         1, p.Y->Shape()[2], 1, p.kernel_shape[0],
                         1, p.strides[0], 0, p.pads[0], 0, p.pads[1],
                         1, p.dilations[0], y);
      }
    }
  }
  return Status::OK();
}

template <typename T>
Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
  typedef typename ToHipType<T>::MappedType HipT;
...
@@ -43,14 +160,23 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
  const Tensor* X = context->Input<Tensor>(0);
  const TensorShape& x_shape = X->Shape();
  auto x_dims = x_shape.AsShapeVector();
- auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  auto x_dimensions = X->Shape().NumDimensions();
  if (x_dimensions < 3 || x_dimensions > 5) {
    // TODO: the error message should tell which operator raises it.
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input X must be 3-, 4- or 5-dimensional.",
                           " X: ", X->Shape().ToString().c_str());
  }
+ if (x_dimensions == 4 || x_dimensions == 3) {
+   ORT_RETURN_IF_ERROR(ConvTranspose_col2im(context, dynamic_padding));
+   return Status::OK();
+ }
+ auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  const Tensor* W = context->Input<Tensor>(1);
  const TensorShape& w_shape = W->Shape();
  auto w_dims = w_shape.AsShapeVector();
...
@@ -181,8 +307,8 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
        miopenConvolutionBackwardData(
            MiopenHandle(),
            &alpha,
            s_.x_tensor,
            x_data,
            s_.w_desc,
            w_data,
            s_.conv_desc,
...
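The ConvTranspose_col2im path added above follows the usual deconvolution-as-GEMM recipe. Per image and group (writing G for conv_transpose_attrs_.group), and using the layout noted in the code's own comments, the rocblasGemmHelper call computes

$$ \mathrm{col} = W_g^{\top} X_g,\qquad W_g \in \mathbb{R}^{(C_{\mathrm{in}}/G)\times(C_{\mathrm{out}}/G\cdot k_h k_w)},\quad X_g \in \mathbb{R}^{(C_{\mathrm{in}}/G)\times(H_{\mathrm{in}} W_{\mathrm{in}})},\quad \mathrm{col} \in \mathbb{R}^{(C_{\mathrm{out}}/G\cdot k_h k_w)\times(H_{\mathrm{in}} W_{\mathrm{in}})}, $$

and col2im_gpu then scatter-adds each column of col into the output window it corresponds to, the inverse of the im2col gather used by the forward conv_im2col path. The filter operand is transposed only when the raw p.F tensor is used; the cached transposed_filter_ path passes rocblas_operation_none, presumably because that buffer already holds the weights in the transposed layout.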