Project: gaoqiong / onnxruntime_v14

Commit 1a91fcc2, authored Jul 25, 2023 by gaoqiong
Commit message: add files required by dtk (original: "add dtk所需文件")
Parent: a144865d
Pipeline #492 failed with stages in 0 seconds

Showing 20 changed files with 2138 additions and 0 deletions (+2138, -0)
All changed files are new and live under build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/:

  math/variadic_elementwise_ops_impl.cu   +154  -0
  math/variadic_elementwise_ops_impl.h     +40  -0
  math/variadic_elementwise_ops_tags.h     +14  -0
  multi_tensor/common.cuh                 +151  -0
  nn/batch_norm.cc                        +187  -0
  nn/batch_norm.h                          +57  -0
  nn/dropout.cc                           +132  -0
  nn/dropout.h                             +31  -0
  nn/dropout_impl.cu                      +191  -0
  nn/dropout_impl.h                        +17  -0
  nn/instance_norm.cc                     +310  -0
  nn/instance_norm.h                       +23  -0
  nn/instance_norm_impl.cu                 +61  -0
  nn/instance_norm_impl.h                  +25  -0
  nn/layer_norm.cc                        +121  -0
  nn/layer_norm.h                          +27  -0
  nn/layer_norm_impl.cu                   +405  -0
  nn/layer_norm_impl.h                     +46  -0
  nn/lrn.cc                               +112  -0
  nn/lrn.h                                 +34  -0

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/variadic_elementwise_ops_impl.cu  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/variadic_elementwise_ops_impl.h"
#include "core/providers/rocm/cu_inc/variadic_elementwise_impl.cuh"
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
#include "core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh"
#include "core/providers/rocm/math/variadic_elementwise_ops_tags.h"

namespace onnxruntime {
namespace rocm {

template <typename T, typename VariadicElementwiseOpTag>
struct VariadicElementwiseOpTraits;

#define DEFINE_TRAITS(VariadicElementwiseOpTag, ImplName) \
template <typename T> \
struct VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag> { \
using ScalarComputeFunctor = OP_##ImplName<T, T, T>; \
\
static void ComputeFn( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count) { \
Impl_##ImplName( \
stream, \
output_rank_or_simple_broadcast, \
lhs_padded_strides, \
lhs_data, \
rhs_padded_strides, \
rhs_data, \
fdm_output_strides, \
fdm_H, \
fdm_C, \
output_data, \
count); \
} \
};

DEFINE_TRAITS(variadic_elementwise_ops::Sum, Add)
DEFINE_TRAITS(variadic_elementwise_ops::Min, Min)
DEFINE_TRAITS(variadic_elementwise_ops::Max, Max)

#undef DEFINE_TRAITS

template <typename T, typename VariadicElementwiseOpTag>
void Impl_General(
    hipStream_t stream,
    int32_t output_rank_or_simple_broadcast,
    const TArray<int64_t>* lhs_padded_strides,
    const T* lhs_data,
    const TArray<int64_t>* rhs_padded_strides,
    const T* rhs_data,
    const TArray<fast_divmod>* fdm_output_strides,
    const fast_divmod& fdm_H,
    const fast_divmod& fdm_C,
    T* output_data,
    size_t count) {
  VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ComputeFn(
      stream,
      output_rank_or_simple_broadcast,
      lhs_padded_strides,
      lhs_data,
      rhs_padded_strides,
      rhs_data,
      fdm_output_strides,
      fdm_H,
      fdm_C,
      output_data,
      count);
}

template <typename T, typename VariadicElementwiseOpTag>
void Impl_NoBroadcastInputBatch(
    hipStream_t stream,
    InputBatchArray<T> input_data_batch,
    T* output_data,
    size_t count) {
  VariadicElementWiseNoBroadcastInputBatchImpl<
      T,
      typename VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ScalarComputeFunctor,
      k_max_input_batch_size>(
      stream,
      typename VariadicElementwiseOpTraits<T, VariadicElementwiseOpTag>::ScalarComputeFunctor{},
      count,
      input_data_batch,
      output_data);
}

#define SPECIALIZE_IMPL(T, VariadicElementwiseOpTag) \
template void Impl_General<T, VariadicElementwiseOpTag>( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count); \
\
template void Impl_NoBroadcastInputBatch<T, VariadicElementwiseOpTag>( \
hipStream_t stream, \
InputBatchArray<T> input_data_batch, \
T * output_data, \
size_t count);
// the postfix means the types supported by the op:
// B: uint8_t
// W: uint16_t
// U: uint32_t
// Z: uint64_t
// C: int8_t
// S: int16_t
// I: int32_t
// L: int64_t
// H: float16
// F: float
// D: double
// O: bool
#define SPECIALIZE_IMPL_HFD(VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(half, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(float, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(double, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(BFloat16, VariadicElementwiseOpTag)
#define SPECIALIZE_IMPL_UZILHFD(VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(uint32_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(uint64_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(int32_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL(int64_t, VariadicElementwiseOpTag) \
SPECIALIZE_IMPL_HFD(VariadicElementwiseOpTag)

SPECIALIZE_IMPL_HFD(variadic_elementwise_ops::Sum)
SPECIALIZE_IMPL_UZILHFD(variadic_elementwise_ops::Min)
SPECIALIZE_IMPL_UZILHFD(variadic_elementwise_ops::Max)

#undef SPECIALIZE_IMPL_UZILHFD
#undef SPECIALIZE_IMPL_HFD
#undef SPECIALIZE_IMPL

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/variadic_elementwise_ops_impl.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <cstdint>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

template <typename T, typename VariadicElementwiseOpTag>
void Impl_General(
    hipStream_t stream,
    int32_t output_rank_or_simple_broadcast,
    const TArray<int64_t>* lhs_padded_strides,
    const T* lhs_data,
    const TArray<int64_t>* rhs_padded_strides,
    const T* rhs_data,
    const TArray<fast_divmod>* fdm_output_strides,
    const fast_divmod& fdm_H,
    const fast_divmod& fdm_C,
    T* output_data,
    size_t count);

constexpr int32_t k_max_input_batch_size = 8;

template <typename T>
using InputBatchArray = TArray<const T*, k_max_input_batch_size>;

template <typename T, typename VariadicElementwiseOpTag>
void Impl_NoBroadcastInputBatch(
    hipStream_t stream,
    InputBatchArray<T> input_data_batch,
    T* output_data,
    size_t count);

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/variadic_elementwise_ops_tags.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once

namespace onnxruntime {
namespace rocm {
namespace variadic_elementwise_ops {

struct Sum {};
struct Min {};
struct Max {};

}  // namespace variadic_elementwise_ops
}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/multi_tensor/common.cuh  (new file, mode 100644)

//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include <vector>
#include "core/common/common.h"
#include "core/common/gsl.h"

namespace onnxruntime {
namespace rocm {

// initial reference from:
// https://github.com/NVIDIA/apex/blob/5b71d3695bf39efcdcda9dff5be2f70314b8f091/csrc/multi_tensor_apply.cuh#L15
// further experiment to get the numbers below. The larger the better, but if too large, it won't fit into GPU stack.
constexpr int ACTUAL_TENSOR_GROUP_SIZE[8] = {1, 1, 2, 3, 4, 5, 6, 7};
constexpr int MAX_BLOCK_COUNTS[8] = {256, 320, 320, 320, 320, 288, 288, 256};
constexpr int MAX_TENSOR_GROUP_COUNTS[8] = {1, 96, 64, 32, 32, 32, 32, 32};
constexpr int MAX_BLOCK_THREAD_COUNTS[8] = {256, 512, 512, 512, 512, 512, 512, 512};

// TensorGroupSize is the number of parallel tensors. For element-wise
// operators such as Relu, it should be 1. For two-operand operators such as
// element-wise addition, it should be 2. The value 0 is reserved for implementing
// kernels to handle a single large tensor.
template <int TensorGroupSize>
struct ChunkGroup {
  // Number of chunks in this ChunkGroup.
  // It's the effective size of block_index_to_tensor_group_index and
  // block_index_to_chunk_start_index.
  // The i-th chunk starts at the block_index_to_chunk_start_index[i]-th
  // element in the block_index_to_tensor_group_index[i]-th tensor.
  int chunk_count = 0;
  // Max number of elements in each chunk in this ChunkGroup.
  // It's an upper bound because chunks located at the end of tensors
  // are not always full. For example, if we split a 7-element vector into
  // two 4-element chunks, the second chunk may contain only 3 actual values.
  int chunk_size = 0;
  // blkIdx.x block processes chunks in block_index_to_tensor_group_index[blkIdx.x]-th
  // tensor group. Each chunk starts from block_index_to_chunk_start_index[blkIdx.x]-th
  // element until reaching the end of this chunk or the end of the whole tensor.
  //
  // Let i = block_index_to_tensor_group_index[blkIdx.x]
  //     n = tensor_sizes[i]
  //     b = block_index_to_chunk_start_index[blkIdx.x]
  //     e = min(b + chunk_size, n)
  // The valid index range for blockIdx.x is defined by the following equation.
  //   b <= valid index < e
  int block_index_to_tensor_group_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
  int block_index_to_chunk_start_index[MAX_BLOCK_COUNTS[TensorGroupSize]];
  int tensor_sizes[MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
  // The addresses of tensors where the chunks are extracted from.
  //   1. tensor_ptrs[0][i], ..., tensor_ptrs[TensorGroupSize-1][i] are
  //      the tensors' pointers in the i-th group.
  //   2. All tensors in the i-th group have the same size, tensor_sizes[i].
  void* tensor_ptrs[ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]][MAX_TENSOR_GROUP_COUNTS[TensorGroupSize]];
  // Max number of GPU blocks to process the chunks in this chunk group.
  const static int max_block_count = MAX_BLOCK_COUNTS[TensorGroupSize];
  // Max number of tensor groups in this chunk group.
  const static int max_tensor_group_count = MAX_TENSOR_GROUP_COUNTS[TensorGroupSize];
  // The suggested number of threads to launch per GPU block.
  const static int thread_count_per_block = MAX_BLOCK_THREAD_COUNTS[TensorGroupSize];
};

template <int TensorGroupSize>
int compute_max_tensor_size_per_launch(int element_count_per_thread) {
  constexpr int block_count = ChunkGroup<TensorGroupSize>::max_block_count;
  constexpr int thread_count_per_block = ChunkGroup<TensorGroupSize>::thread_count_per_block;
  return block_count * thread_count_per_block * element_count_per_thread;
}

template <int TensorGroupSize, typename TMultiTensorFunctor, typename... TFunctorParams>
void launch_multi_tensor_functor(
    hipStream_t stream,
    const int chunk_size,
    gsl::span<int> tensor_sizes,
    gsl::span<std::vector<void*>> grouped_tensor_pointers,
    TMultiTensorFunctor multipleTensorKernel,
    TFunctorParams&&... kernelParams) {
  // Check if 32-bit integer is enough.
  ORT_ENFORCE(tensor_sizes.size() > 0);
  ORT_ENFORCE(tensor_sizes.size() < static_cast<size_t>(INT_MAX));
  ORT_ENFORCE(grouped_tensor_pointers.size() > 0);
  ORT_ENFORCE(grouped_tensor_pointers.size() < static_cast<size_t>(INT_MAX));
  ORT_ENFORCE(chunk_size > 0);

  // Number of groups, for example, the number of updated weight tensors in Lamb optimizer.
  const int group_count = static_cast<int>(grouped_tensor_pointers.size());
  // Tensor count per group.
  const int group_size = static_cast<int>(grouped_tensor_pointers[0].size());
  int tensor_group_index = 0;
  int block_index = 0;

  ORT_ENFORCE(grouped_tensor_pointers.size() == tensor_sizes.size());
  ORT_ENFORCE(group_size == ACTUAL_TENSOR_GROUP_SIZE[TensorGroupSize]);
  for (int i = 0; i < group_count; ++i) {
    ORT_ENFORCE(grouped_tensor_pointers[i].size() == static_cast<size_t>(group_size));
  }

  // Handle multiple tensors per ROCM kernel call.
  ChunkGroup<TensorGroupSize> chunk_group;
  for (int i = 0; i < group_count; ++i) {
    // Add pointers to one group of tensors into chunk_group.
    for (int j = 0; j < group_size; ++j) {
      chunk_group.tensor_ptrs[j][tensor_group_index] = grouped_tensor_pointers[i][j];
    }

    // Assuming that all tensors' shapes are the same, we just record w's size.
    chunk_group.tensor_sizes[tensor_group_index] = tensor_sizes[i];
    chunk_group.chunk_size = chunk_size;

    const int chunk_count = (tensor_sizes[i] + chunk_size - 1) / chunk_size;

    // Process all chunks in this tensor group.
    for (int chunk_index = 0; chunk_index < chunk_count; ++chunk_index) {
      chunk_group.block_index_to_tensor_group_index[block_index] = tensor_group_index;
      chunk_group.block_index_to_chunk_start_index[block_index] = chunk_index * chunk_size;
      // After ++block_index, block_index becomes the count of chunks in chunk_group.
      ++block_index;
      chunk_group.chunk_count = block_index;

      if (block_index == chunk_group.max_block_count) {
        multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
        block_index = 0;
      }
    }

    // After ++tensor_group_index, tensor_group_index becomes the count of tensor groups in chunk_group.
    ++tensor_group_index;
    if (tensor_group_index == chunk_group.max_tensor_group_count) {
      multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
      block_index = 0;
      tensor_group_index = 0;
    }
  }

  // This round of processing tensor groups is finished.
  // All the groups remaining in the chunk group should be processed right now.
  if (block_index != 0) {
    multipleTensorKernel(stream, chunk_group, std::forward<TFunctorParams>(kernelParams)...);
    block_index = 0;
    tensor_group_index = 0;
  }
}

}  // namespace rocm
}  // namespace onnxruntime
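
The comments in ChunkGroup and launch_multi_tensor_functor above describe how tensors are batched into chunks and flushed to the functor whenever the block or tensor-group capacity is reached. As a rough illustration only (not part of this commit), a hypothetical caller for TensorGroupSize == 1 could look like the sketch below; the functor name, chunk size, and wrapper function are made up for the example.

// Hypothetical usage sketch (not in this commit) for launch_multi_tensor_functor with
// TensorGroupSize == 1, i.e. one tensor per group. Assumes the header above is on the include path.
#include <vector>
#include "core/providers/rocm/multi_tensor/common.cuh"

namespace onnxruntime {
namespace rocm {

struct FillZeroFunctor {
  // Invoked once per accumulated ChunkGroup; a real functor would launch a
  // __global__ kernel here that walks chunk_group.block_index_to_* on the GPU.
  void operator()(hipStream_t stream, ChunkGroup<1> chunk_group) const {
    (void)stream;
    (void)chunk_group;
  }
};

// Drives the launcher over every tensor described by (sizes, grouped_pointers)
// in chunks of 2048 elements (an arbitrary illustrative chunk size).
void FillAllWithZero(hipStream_t stream, gsl::span<int> sizes,
                     gsl::span<std::vector<void*>> grouped_pointers) {
  launch_multi_tensor_functor<1>(stream, 2048, sizes, grouped_pointers, FillZeroFunctor());
}

}  // namespace rocm
}  // namespace onnxruntime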

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/batch_norm.cc  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "batch_norm.h"
#include "core/providers/common.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"

using namespace std;

namespace onnxruntime {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
7, 8, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
9, 13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
14, 14, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("U", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
BatchNormalization, \
kOnnxDomain, \
15, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
BatchNorm<T>);

template <typename T>
Status BatchNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
  typedef typename ToHipType<T>::MappedType HipT;

  const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
  const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
  const Tensor* B = p_op_kernel_context->Input<Tensor>(2);
  const Tensor* mean = p_op_kernel_context->Input<Tensor>(3);
  const Tensor* var = p_op_kernel_context->Input<Tensor>(4);

  ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1));

  const TensorShape& x_shape = X->Shape();
  const TensorShape& channel_shape = mean->Shape();

  Tensor* Y = p_op_kernel_context->Output(0, x_shape);
  Tensor* running_mean = p_op_kernel_context->Output(1, channel_shape);
  Tensor* running_var = p_op_kernel_context->Output(2, channel_shape);
  Tensor* saved_mean = p_op_kernel_context->Output(3, channel_shape);
  Tensor* saved_var = p_op_kernel_context->Output(4, channel_shape);

  auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  auto scale_data = reinterpret_cast<const HipT*>(scale->Data<T>());
  auto b_data = reinterpret_cast<const HipT*>(B->Data<T>());
  auto mean_data = reinterpret_cast<const HipT*>(mean->Data<T>());
  auto var_data = reinterpret_cast<const HipT*>(var->Data<T>());
  auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());

  const auto alpha = Consts<HipT>::One;
  const auto beta = Consts<HipT>::Zero;

  MiopenTensor data_desc;
  vector<int64_t> new_dims;
  BatchNormHelper::NormalizeDims(x_shape, new_dims);
  ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));

  // For half data type, the alpha, beta, scale, B, mean, var need to be float type
  if (X->IsDataType<MLFloat16>()) {
    MiopenTensor scale_desc;
    ORT_RETURN_IF_ERROR(scale_desc.Set(new_dims, MiopenTensor::GetDataType<float>()));
    MiopenTensor bn_tensor_desc;
    ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, miopen_batch_norm_mode_));

    // Convert the scale, B, mean, var to float
    const int64_t C = x_shape.GetDims()[1];
    auto f_scale = GetScratchBuffer<float>(C);
    auto f_B = GetScratchBuffer<float>(C);
    auto f_mean = GetScratchBuffer<float>(C);
    auto f_var = GetScratchBuffer<float>(C);

    Impl_Cast<HipT, float>(Stream(), scale_data, f_scale.get(), C);
    Impl_Cast<HipT, float>(Stream(), b_data, f_B.get(), C);
    Impl_Cast<HipT, float>(Stream(), mean_data, f_mean.get(), C);
    Impl_Cast<HipT, float>(Stream(), var_data, f_var.get(), C);

    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardInferenceHelper(
        MiopenHandle(), miopen_batch_norm_mode_, &alpha, &beta, data_desc, x_data, data_desc, y_data,
        bn_tensor_desc, f_scale.get(), f_B.get(), f_mean.get(), f_var.get(), epsilon_));

    return Status::OK();
  }

  MiopenTensor bn_tensor_desc;
  ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, miopen_batch_norm_mode_));

  // in BatchNorm Forward Training mode if all 5 outputs present
  if (running_mean && running_var && saved_mean && saved_var) {
    auto running_mean_data = reinterpret_cast<HipT*>(running_mean->MutableData<T>());
    auto running_var_data = reinterpret_cast<HipT*>(running_var->MutableData<T>());
    auto saved_mean_data = reinterpret_cast<HipT*>(saved_mean->MutableData<T>());
    auto saved_inv_var_data = reinterpret_cast<HipT*>(saved_var->MutableData<T>());

    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
        MiopenHandle(), miopen_batch_norm_mode_, &alpha, &beta, data_desc, x_data, data_desc, y_data,
        bn_tensor_desc, scale_data, b_data, momentum_, running_mean_data, running_var_data, epsilon_,
        saved_mean_data, saved_inv_var_data));
    // in BatchNorm Forward Inference mode if only Y output present
  } else {
    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardInferenceHelper(
        MiopenHandle(), miopen_batch_norm_mode_, &alpha, &beta, data_desc, x_data, data_desc, y_data,
        bn_tensor_desc, scale_data, b_data, mean_data, var_data, epsilon_));
  }

  return Status::OK();
}

#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status BatchNorm<T>::ComputeInternal(OpKernelContext* ctx) const;

SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
SPECIALIZED_COMPUTE(MLFloat16)

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/batch_norm.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
class BatchNorm final : public RocmKernel {
 public:
  BatchNorm(const OpKernelInfo& op_kernel_info)
      : RocmKernel{op_kernel_info},
        miopen_batch_norm_mode_(miopenBNSpatial),
        momentum_(0.9) {
    float tmp_epsilon;
    ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
    epsilon_ = ClampMiopenBatchNormEpsilon(static_cast<double>(tmp_epsilon));

    // spatial or not
    int64_t tmp_spatial;
    if (op_kernel_info.GetAttr<int64_t>("spatial", &tmp_spatial).IsOK()) {
      spatial_ = tmp_spatial;
    }

    if (spatial_ == 0) {
      miopen_batch_norm_mode_ = miopenBNPerActivation;
    }

    float tmp_momentum;
    if (op_kernel_info.GetAttr<float>("momentum", &tmp_momentum).IsOK()) {
      momentum_ = static_cast<double>(tmp_momentum);
    }

    is_training_mode_ = (op_kernel_info.GetAttrOrDefault<int64_t>("training_mode", 0) == 1);

    const auto& node = op_kernel_info.node();
    auto opset = node.SinceVersion();

    // batch norm opset 14 (or higher) is not implemented for training mode
    ORT_ENFORCE(!(is_training_mode_ && opset >= 14),
                "Training mode does not support BN opset 14 (or higher) yet.");
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  double epsilon_;
  int64_t spatial_ = 1;  // default as per spec
  miopenBatchNormMode_t miopen_batch_norm_mode_;
  double momentum_;
  bool is_training_mode_ = 0;  // default as per spec
};

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/dropout.cc  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/nn/dropout.h"
#include "core/providers/rocm/nn/dropout_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

namespace {

template <typename T>
struct GetRatioDataImpl {
  void operator()(const Tensor* ratio, float& ratio_data) const {
    ratio_data = static_cast<float>(*(ratio->Data<T>()));
    ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f, "ratio_data is outside range [0, 1)");
  }
};

template <typename T>
struct DropoutComputeImpl {
  void operator()(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
                  const int64_t mask_element_count, const float ratio_data, PhiloxGenerator& generator,
                  const Tensor& X, Tensor& Y, void* mask_data, bool use_bitmask) const {
    typedef typename ToHipType<T>::MappedType HipT;
    const HipT* X_data = reinterpret_cast<const HipT*>(X.Data<T>());
    HipT* Y_data = reinterpret_cast<HipT*>(Y.MutableData<T>());

    DropoutKernelImpl<HipT>(prop, stream, N, mask_element_count, ratio_data, generator, X_data, Y_data,
                            mask_data, use_bitmask);
  }
};

}  // namespace

ONNX_OPERATOR_VERSIONED_KERNEL_EX(Dropout, kOnnxDomain, 12, 12, kRocmExecutionProvider,
                                  (*KernelDefBuilder::Create())
                                      .TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes())
                                      .TypeConstraint("T1", DataTypeImpl::AllIEEEFloatTensorTypes())
                                      .TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
                                      .InputMemoryType(OrtMemTypeCPUInput, 1)
                                      .InputMemoryType(OrtMemTypeCPUInput, 2),
                                  Dropout<false>);

ONNX_OPERATOR_KERNEL_EX(Dropout, kOnnxDomain, 13, kRocmExecutionProvider,
                        (*KernelDefBuilder::Create())
                            .TypeConstraint("T", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
                            .TypeConstraint("T1", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
                            .TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
                            .InputMemoryType(OrtMemTypeCPUInput, 1)
                            .InputMemoryType(OrtMemTypeCPUInput, 2),
                        Dropout<false>);

template <bool UseBitmask>
Status Dropout<UseBitmask>::ComputeInternal(OpKernelContext* context) const {
  // Get X_data
  const Tensor* X = context->Input<Tensor>(0);
  if (!X) return Status(common::ONNXRUNTIME, common::FAIL, "X Input is not available.");
  const TensorShape& shape = X->Shape();
  const int64_t N = shape.Size();

  // Get Y_data
  auto Y = context->Output(0, shape);

  // Get mask_data
  Tensor* mask = nullptr;
  int64_t mask_element_count = N;
  if (UseBitmask) {
    mask_element_count = (N + kNumBitsPerBitmaskElement - 1) / kNumBitsPerBitmaskElement;
    mask = context->Output(1, {mask_element_count});
  } else {
    mask = context->Output(1, shape);
  }

  ORT_ENFORCE(!mask || mask->Shape().Size() == mask_element_count);

  // Get the ratio_data
  float ratio_data = default_ratio_;
  auto ratio = context->Input<Tensor>(1);
  if (ratio) {
    utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(ratio->GetElementType());
    t_disp.Invoke<GetRatioDataImpl>(ratio, ratio_data);
  }

  const Tensor* training_mode = context->Input<Tensor>(2);
  // Check for inference mode.
  if (ratio_data == 0.f || !training_mode || !(*(training_mode->Data<bool>()))) {
    const void* X_data = X->DataRaw();
    void* Y_data = Y->MutableDataRaw();
    if (Y_data != X_data) {
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y_data, X_data, X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
    }

    // If mask is requested, return all 1s.
    if (mask) {
      if (UseBitmask) {
        HIP_RETURN_IF_ERROR(hipMemsetAsync(mask->MutableDataRaw(), -1,
                                           mask_element_count * sizeof(BitmaskElementType), Stream()));
      } else {
        HIP_RETURN_IF_ERROR(
            hipMemsetAsync(mask->MutableData<bool>(), true, mask_element_count * sizeof(bool), Stream()));
      }
    }

    return Status::OK();
  }

  IAllocatorUniquePtr<void> temp_mask_buffer{};  // buffer to use if mask is not provided
  void* const mask_data = [this, mask_element_count, mask, &temp_mask_buffer]() {
    if (mask) return mask->MutableDataRaw();
    temp_mask_buffer =
        GetScratchBuffer<void>(mask_element_count * (UseBitmask ? sizeof(BitmaskElementType) : sizeof(bool)));
    return temp_mask_buffer.get();
  }();

  PhiloxGenerator& generator = generator_ ? *generator_ : PhiloxGenerator::Default();

  utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(X->GetElementType());
  t_disp.Invoke<DropoutComputeImpl>(GetDeviceProp(), Stream(), N, mask_element_count, ratio_data, generator, *X,
                                    *Y, mask_data, UseBitmask);

  return Status::OK();
}

// Instantiation for Dropout.
template class Dropout<false>;
template class Dropout<true>;

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/dropout.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/common.h"
#include "core/framework/random_generator.h"

namespace onnxruntime {
namespace rocm {

template <bool UseBitmask>
class Dropout final : public RocmKernel {
 public:
  Dropout(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t seed = 0;
    if (info.GetAttr<int64_t>("seed", &seed).IsOK()) {
      generator_ = std::make_unique<PhiloxGenerator>(static_cast<uint64_t>(seed));
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  mutable std::unique_ptr<PhiloxGenerator> generator_;
  static constexpr float default_ratio_ = 0.5f;
};

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/dropout_impl.cu  (new file, mode 100644)

#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
#include "core/providers/rocm/nn/dropout_impl.h"
#include <hiprand_kernel.h>
#include <algorithm>
#include "core/providers/rocm/cu_inc/bitmask.cuh"

namespace onnxruntime {
namespace rocm {

constexpr int kBlockSize = 256;
constexpr int kNumUnroll = 4;

template <typename T, bool UseBitmask>
__global__ void DropoutKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
                              const int steps_per_thread, const fast_divmod fdm_bits_per_element,
                              const float ratio, const std::pair<uint64_t, uint64_t> seeds, const T* X_data,
                              T* Y_data, void* mask_data) {
  HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
  const float p = 1.0f - ratio;
  const float scale = 1.0f / p;

  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seeds.first, idx, seeds.second, &state);
  float4 rand;

  // We ensure every thread generates the same number of random numbers (by rounding
  // up the size) and at the same timestep (by syncing threads).
  // From ROCM hiprand documentation:
  // The Philox_4x32_10 algorithm is closely tied to the thread and block count.
  // Each thread computes 4 random numbers in the same time thus the most efficient
  // use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
  for (int i = 0; i < steps_per_thread; ++i) {
    HIP_LONG id = idx * kNumUnroll + i * step_size;
    rand = hiprand_uniform4(&state);
    BitmaskElementType thread_bitmask = 0;

    // actual computation
#pragma unroll
    for (int i = 0; i < kNumUnroll; ++i) {
      HIP_LONG li = id + i;
      if (li < N) {
        bool mask = (&rand.x)[i] < p;
        Y_data[li] = static_cast<T>(static_cast<float>(X_data[li]) * mask * scale);
        if (UseBitmask) {
          thread_bitmask |= (mask << i);
        } else {
          reinterpret_cast<bool*>(mask_data)[li] = mask;
        }
      }
    }

    if (UseBitmask) {
      SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
                             reinterpret_cast<BitmaskElementType*>(mask_data));
    }

    __syncthreads();
  }
}

template <typename T, bool UseBitmask>
__global__ void DropoutVectorizedKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
                                        const int steps_per_thread, const fast_divmod fdm_bits_per_element,
                                        const float ratio, const std::pair<uint64_t, uint64_t> seeds,
                                        const T* X_data, T* Y_data, void* mask_data) {
  HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
  const float p = 1.0f - ratio;
  const float scale = 1.0f / p;

  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seeds.first, idx, seeds.second, &state);
  float4 rand;

  // using vectorized data load/store approach when N % 4 == 0 since this is
  // typical case for input shape size
  using LoadT = aligned_vector<T, kNumUnroll>;
  using MaskLoadT = aligned_vector<bool, kNumUnroll>;

  for (int i = 0; i < steps_per_thread; ++i) {
    HIP_LONG id = idx * kNumUnroll + i * step_size;
    rand = hiprand_uniform4(&state);
    BitmaskElementType thread_bitmask = 0;

    if (id < N) {
      // vectorized load into storage
      T src[kNumUnroll];
      LoadT* value = reinterpret_cast<LoadT*>(&src);
      *value = *reinterpret_cast<const LoadT*>(&X_data[id]);

      T r[kNumUnroll];
      bool masks[kNumUnroll];

      // actual computation
#pragma unroll
      for (int ii = 0; ii < kNumUnroll; ++ii) {
        bool mask = (&rand.x)[ii] < p;
        r[ii] = static_cast<T>(static_cast<float>(src[ii]) * mask * scale);
        if (UseBitmask) {
          thread_bitmask |= (mask << ii);
        } else {
          masks[ii] = mask;
        }
      }

      // Vectorized writes for mask_data & Y_data
      *(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
      if (!UseBitmask) {
        *(reinterpret_cast<MaskLoadT*>(&reinterpret_cast<bool*>(mask_data)[id])) =
            *reinterpret_cast<MaskLoadT*>(&masks[0]);
      }
    }

    if (UseBitmask) {
      SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
                             reinterpret_cast<BitmaskElementType*>(mask_data));
    }

    __syncthreads();
  }
}

#define LAUNCH_DROPOUT_KERNEL(FuncName, UseBitmask) \
hipLaunchKernelGGL(HIP_KERNEL_NAME(FuncName<T, UseBitmask>), grid_size, kBlockSize, 0, stream, \
static_cast<HIP_LONG>(N), static_cast<HIP_LONG>(mask_element_count), step_size, steps_per_thread, \
fdm_bits_per_element, ratio, seeds, X_data, Y_data, mask_data)
#define HANDLE_DROPOUT_USE_BITMASK(FuncName) \
if (use_bitmask) { \
LAUNCH_DROPOUT_KERNEL(FuncName, true); \
} else { \
LAUNCH_DROPOUT_KERNEL(FuncName, false); \
}

template <typename T>
void DropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
                       const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator,
                       const T* X_data, T* Y_data, void* mask_data, bool use_bitmask) {
  const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / kBlockSize;
  const int grid_size =
      std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, kBlockSize * kNumUnroll)));

  // Compute the number of random numbers generated by each thread, and increment philox generator offset by that
  // amount.
  const int step_size = kBlockSize * grid_size * kNumUnroll;
  const int steps_per_thread = static_cast<int>(CeilDiv(N, step_size));
  auto seeds = generator.NextPhiloxSeeds(static_cast<uint64_t>(steps_per_thread * kNumUnroll));

  fast_divmod fdm_bits_per_element(kNumBitsPerBitmaskElement);
  if (N % kNumUnroll != 0) {
    HANDLE_DROPOUT_USE_BITMASK(DropoutKernel);
  } else {
    HANDLE_DROPOUT_USE_BITMASK(DropoutVectorizedKernel);
  }
}

#undef HANDLE_DROPOUT_USE_BITMASK
#undef LAUNCH_DROPOUT_KERNEL
#define SPECIALIZED_DROPOUT_IMPL(T) \
template void DropoutKernelImpl<T>(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N, \
const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator, \
const T* X_data, T* Y_data, void* mask_data, bool use_bitmask);

SPECIALIZED_DROPOUT_IMPL(float)
SPECIALIZED_DROPOUT_IMPL(double)
SPECIALIZED_DROPOUT_IMPL(half)
SPECIALIZED_DROPOUT_IMPL(BFloat16)

#undef SPECIALIZED_DROPOUT_IMPL

}  // namespace rocm
}  // namespace onnxruntime
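
DropoutKernelImpl above derives grid_size, step_size, and steps_per_thread so that every thread draws a multiple of four Philox outputs per launch, matching the hiprand note in the kernel. The small standalone check below (not part of the commit; the device numbers and element count are illustrative) reproduces that arithmetic on the host.

// Standalone check of the launch arithmetic used by DropoutKernelImpl.
// kBlockSize/kNumUnroll match the kernel above; the device properties are made-up examples.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int kBlockSize = 256;
  constexpr int kNumUnroll = 4;
  const int64_t N = 10000;              // element count (example)
  const int max_threads_per_mp = 2048;  // stands in for hipDeviceProp_t::maxThreadsPerMultiProcessor
  const int multi_processor_count = 60; // stands in for hipDeviceProp_t::multiProcessorCount

  const int blocks_per_sm = max_threads_per_mp / kBlockSize;                                    // 8
  const int64_t blocks_needed = (N + kBlockSize * kNumUnroll - 1) / (kBlockSize * kNumUnroll);  // CeilDiv -> 10
  const int grid_size = static_cast<int>(std::min<int64_t>(multi_processor_count * blocks_per_sm, blocks_needed));

  const int step_size = kBlockSize * grid_size * kNumUnroll;                       // 10240
  const int steps_per_thread = static_cast<int>((N + step_size - 1) / step_size);  // 1
  // Each thread therefore consumes steps_per_thread * kNumUnroll = 4 Philox outputs,
  // which is the offset the real code passes to PhiloxGenerator::NextPhiloxSeeds.
  std::printf("grid_size=%d step_size=%d steps_per_thread=%d\n", grid_size, step_size, steps_per_thread);
  return 0;
}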

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/dropout_impl.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/framework/random_generator.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
void DropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
                       const int64_t mask_element_count, const float ratio, PhiloxGenerator& generator,
                       const T* X_data, T* Y_data, void* mask_data, bool use_bitmask);

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/instance_norm.cc  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "instance_norm.h"
#include "instance_norm_impl.h"
#include "core/providers/cpu/nn/instance_norm_helper.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"

namespace onnxruntime {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
InstanceNormalization, \
kOnnxDomain, \
6, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
InstanceNorm<T>);

REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)

template <typename T>
InstanceNorm<T>::InstanceNorm(const OpKernelInfo& op_kernel_info)
    : RocmKernel(op_kernel_info) {
  float tmp_epsilon;
  ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
  epsilon_ = ClampMiopenBatchNormEpsilon(tmp_epsilon);
}

template <typename T>
Status InstanceNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
  typedef typename ToHipType<T>::MappedType HipT;

  const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
  const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
  const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);

  ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));

  const TensorShape& x_shape = X->Shape();
  Tensor* Y = p_op_kernel_context->Output(0, x_shape);

  auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
  const auto* x_data = reinterpret_cast<const HipT*>(X->Data<T>());
  const auto* scale_data = reinterpret_cast<const HipT*>(scale->Data<T>());
  const auto* bias_data = reinterpret_cast<const HipT*>(bias->Data<T>());

  const auto& x_dims = x_shape.GetDims();
  const int64_t N = x_dims[0];
  const int64_t C = x_dims[1];
  const auto one = Consts<HipT>::One;
  const auto zero = Consts<HipT>::Zero;

  if (N == 1) {
    // when N == 1, we can treat it as spatial batch normalization in training
    // as the mean/variance would be computed from input

    MiopenTensor data_desc;
    std::vector<int64_t> new_dims;
    BatchNormHelper::NormalizeDims(x_shape, new_dims);
    ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));

    MiopenTensor stats_desc;
    ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, miopenBNSpatial));

    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
        MiopenHandle(), miopenBNSpatial, &one, &zero, data_desc, x_data, data_desc, y_data, stats_desc,
        scale_data, bias_data, 1.0f, nullptr, nullptr, epsilon_, nullptr, nullptr));
  } else {
    // we use miopenBatchNormalizationForwardTraining to compute mean/variance
    // so collapsing NC into channel

    auto input_count = x_shape.Size();              // N * C * H * W
    auto stats_count = x_shape.SizeToDimension(2);  // N * C
    auto image_size = input_count / stats_count;

    MiopenTensor data_desc;
    ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1},
                                      MiopenTensor::GetDataType<HipT>()));

    MiopenTensor stats_desc;
    ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1},
                                       MiopenTensor::GetDataType<HipT>()));

    const size_t stats_byte_count = stats_count * sizeof(HipT);

    // Mean & Variance are inputs & outputs and must be initialized to zero to work properly
    auto mean = GetScratchBuffer<HipT>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
    auto variance = GetScratchBuffer<HipT>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));

    // We must set the scale & bias inputs to zero as they are inputs to the calculation
    auto unused_scale = GetScratchBuffer<HipT>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
    auto unused_bias = GetScratchBuffer<HipT>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));

    // first, compute mean and variance per-instance per-channel using miopenBatchNorm training
    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
        MiopenHandle(), miopenBNSpatial, &one, &zero, data_desc, x_data, data_desc,
        y_data,  // use y temporarily, would be rewritten later
        stats_desc, unused_scale.get(), unused_bias.get(), 1.0f, mean.get(), variance.get(),
        MIOPEN_BN_MIN_EPSILON, nullptr, nullptr));

    // Y = scale * (x - mean) / sqrt (variance + epsilon) + B
    // X/Y is (N,C,H,W)
    // scale/bias is (1,C,1,1)
    // mean/stddev is (N,C,1,1)
    // NOTE miopenBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
    // and it needs to be corrected with (count - 1) / count
    fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
    fast_divmod fdm_C(gsl::narrow_cast<int>(C));

    InstanceNormImpl<HipT>(
        Stream(),
        x_data,
        scale_data,
        bias_data,
        mean.get(),
        variance.get(),
        (image_size - 1.0) / image_size,
        static_cast<double>(epsilon_),
        fdm_HW,
        fdm_C,
        y_data,
        input_count);
  }
  return Status::OK();
}

template <>
Status InstanceNorm<MLFloat16>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
  typedef typename ToHipType<MLFloat16>::MappedType HipT;

  const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
  const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
  const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);

  ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));

  const TensorShape& x_shape = X->Shape();
  Tensor* Y = p_op_kernel_context->Output(0, x_shape);

  auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<MLFloat16>());
  const auto* x_data = reinterpret_cast<const HipT*>(X->Data<MLFloat16>());
  const auto* scale_data = reinterpret_cast<const HipT*>(scale->Data<MLFloat16>());
  const auto* bias_data = reinterpret_cast<const HipT*>(bias->Data<MLFloat16>());

  const auto& x_dims = x_shape.GetDims();
  const int64_t N = x_dims[0];
  const int64_t C = x_dims[1];
  const auto one = Consts<HipT>::One;
  const auto zero = Consts<HipT>::Zero;

  if (N == 1) {
    // when N == 1, we can treat it as spatial batch normalization in training
    // as the mean/variance would be computed from input

    MiopenTensor data_desc;
    std::vector<int64_t> new_dims;
    BatchNormHelper::NormalizeDims(x_shape, new_dims);
    ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, MiopenTensor::GetDataType<HipT>()));

    MiopenTensor stats_desc;
    ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, miopenBNSpatial));

    // For half input data type, alpha, beta, scale, bias need to be float type.
    // alpha and beta will be of type float as the Consts struct specialization
    // for MLFloat16 takes care of that. Only convert the scale and bias to float.
    auto scale_data_fp32 = GetScratchBuffer<float>(C);
    Impl_Cast<HipT, float>(Stream(), scale_data, scale_data_fp32.get(), C);

    auto bias_data_fp32 = GetScratchBuffer<float>(C);
    Impl_Cast<HipT, float>(Stream(), bias_data, bias_data_fp32.get(), C);

    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
        MiopenHandle(), miopenBNSpatial, &one, &zero, data_desc, x_data, data_desc, y_data, stats_desc,
        scale_data_fp32.get(), bias_data_fp32.get(), 1.0f, nullptr, nullptr, epsilon_, nullptr, nullptr));
  } else {
    // we use miopenBatchNormalizationForwardTraining to compute mean/variance
    // so collapsing NC into channel

    auto input_count = x_shape.Size();              // N * C * H * W
    auto stats_count = x_shape.SizeToDimension(2);  // N * C
    auto image_size = input_count / stats_count;

    MiopenTensor data_desc;
    ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1},
                                      MiopenTensor::GetDataType<HipT>()));

    // stats_desc needs to be of 'float' type even for float16 input as the "stats" are of float type
    MiopenTensor stats_desc;
    ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1},
                                       MiopenTensor::GetDataType<float>()));

    // For half input data type, we need to allocate some "intermediate"
    // float buffers for CuDNN to use.
    const size_t stats_byte_count = stats_count * sizeof(float);

    // Mean & Variance are inputs & outputs and must be initialized to zero to work properly
    auto mean = GetScratchBuffer<float>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
    auto variance = GetScratchBuffer<float>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));

    // We must set the scale & bias inputs to zero as they are inputs to the calculation
    auto unused_scale = GetScratchBuffer<float>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
    auto unused_bias = GetScratchBuffer<float>(stats_count);
    HIP_RETURN_IF_ERROR(hipMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));

    // first, compute mean and variance per-instance per-channel using miopenBatchNorm training
    MIOPEN_RETURN_IF_ERROR(BatchNormalizationForwardTrainingHelper(
        MiopenHandle(), miopenBNSpatial, &one, &zero, data_desc, x_data, data_desc,
        y_data,  // use y temporarily, would be rewritten later
        stats_desc, unused_scale.get(), unused_bias.get(), 1.0f, mean.get(), variance.get(),
        MIOPEN_BN_MIN_EPSILON, nullptr, nullptr));

    // Y = scale * (x - mean) / sqrt (variance + epsilon) + B
    // X/Y is (N,C,H,W)
    // scale/bias is (1,C,1,1)
    // mean/stddev is (N,C,1,1)
    // NOTE miopenBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
    // and it needs to be corrected with (count - 1) / count
    fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
    fast_divmod fdm_C(gsl::narrow_cast<int>(C));

    // The InstanceNormImpl kernel handles the mean/variance in float32, so no casting required here
    InstanceNormImpl<HipT, float>(
        Stream(),
        x_data,
        scale_data,
        bias_data,
        mean.get(),
        variance.get(),
        (image_size - 1.0) / image_size,
        static_cast<double>(epsilon_),
        fdm_HW,
        fdm_C,
        y_data,
        input_count);
  }
  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/instance_norm.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
class InstanceNorm final : public RocmKernel {
 public:
  InstanceNorm(const OpKernelInfo& op_kernel_info);
  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;

 private:
  double epsilon_;
};

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/instance_norm_impl.cu  (new file, mode 100644)

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "instance_norm_impl.h"

namespace onnxruntime {
namespace rocm {

template <typename T1, typename T2>
__global__ void _InstanceNormKernel(
    const T1* __restrict__ input_data,
    const T1* __restrict__ scale,
    const T1* __restrict__ bias,
    const T2* __restrict__ mean,
    const T2* __restrict__ variance,
    const double variance_correction,
    const double epsilon,
    const fast_divmod fdm_HW,
    const fast_divmod fdm_C,
    T1* __restrict__ output_data,
    const HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int nc = fdm_HW.div(id);
  int n, c;
  fdm_C.divmod(nc, n, c);

  // Y = scale * (x - mean) / sqrt (std * std + epsilon) + B
  output_data[id] = scale[c] * (input_data[id] - (T1)mean[nc]) /
                        _Sqrt((T1)variance[nc] * (T1)variance_correction + (T1)epsilon) +
                    bias[c];
}

template <typename T1, typename T2>
void InstanceNormImpl(
    hipStream_t stream,
    const T1* input_data,
    const T1* scale,
    const T1* bias,
    const T2* mean,
    const T2* variance,
    const double variance_correction,
    const double epsilon,
    const fast_divmod& fdm_HW,
    const fast_divmod& fdm_C,
    T1* output_data,
    size_t N) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_InstanceNormKernel<T1, T2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0,
                     stream, input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C,
                     output_data, (HIP_LONG)N);
}

#define SPECIALIZED_IMPL(T1, T2) \
template void InstanceNormImpl<T1, T2>(hipStream_t stream, const T1* input_data, const T1* scale, const T1* bias, const T2* mean, const T2* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T1* output_data, size_t count);

SPECIALIZED_IMPL(float, float)
SPECIALIZED_IMPL(double, double)
// When the input data type is float16, the means and variances will flow in as float32 (special case)
SPECIALIZED_IMPL(half, float)

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/instance_norm_impl.h  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/shared_inc/fast_divmod.h"

namespace onnxruntime {
namespace rocm {

template <typename T1, typename T2>
void InstanceNormImpl(
    hipStream_t stream,
    const T1* input_data,
    const T1* scale,
    const T1* bias,
    const T2* mean,
    const T2* variance,
    const double variance_correction,
    const double epsilon,
    const fast_divmod& fdm_HW,
    const fast_divmod& fdm_C,
    T1* output_data,
    size_t count);

}  // namespace rocm
}  // namespace onnxruntime

build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/layer_norm.cc  (new file, mode 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/layer_norm.h"
#include "core/providers/rocm/nn/layer_norm_impl.h"
#include "core/providers/rocm/rocm_common.h"

namespace onnxruntime {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T, U) \
ONNX_OPERATOR_TYPED_KERNEL_EX(LayerNormalization, kOnnxDomain, 17, T, kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("U", DataTypeImpl::GetTensorType<U>()), \
LayerNorm<T, U, T, false>);

REGISTER_KERNEL_TYPED(float, float)
REGISTER_KERNEL_TYPED(double, float)
REGISTER_KERNEL_TYPED(MLFloat16, float)
REGISTER_KERNEL_TYPED(BFloat16, float)

template <typename T, typename U, typename V, bool simplified>
LayerNorm<T, U, V, simplified>::LayerNorm(const OpKernelInfo& op_kernel_info)
    : RocmKernel(op_kernel_info) {
  ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK());
  float tmp_epsilon;
  ORT_ENFORCE(op_kernel_info.GetAttr<float>("epsilon", &tmp_epsilon).IsOK());
  epsilon_ = tmp_epsilon;
}

template <typename T, typename U, typename V, bool simplified>
Status LayerNorm<T, U, V, simplified>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToHipType<T>::MappedType HipT;
  typedef typename ToHipType<U>::MappedType CudaU;
  typedef typename ToHipType<V>::MappedType CudaV;

  // Inputs
  const Tensor* X = ctx->Input<Tensor>(0);
  const Tensor* scale = ctx->Input<Tensor>(1);
  const Tensor* bias = ctx->Input<Tensor>(2);

  auto X_data = reinterpret_cast<const HipT*>(X->Data<T>());
  auto scale_data = reinterpret_cast<const CudaV*>(scale->Data<V>());
  auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaV*>(bias->Data<V>());

  const TensorShape& x_shape = X->Shape();
  const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());

  int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
  int n2 = gsl::narrow<int>(x_shape.SizeFromDimension(axis));

  const auto scale_size = scale->Shape().Size();
  const auto bias_size = (bias_data) ? bias->Shape().Size() : 0;
  if (n2 == 1 || scale_size != n2 || (bias_data && bias_size != n2)) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Size of X.shape()[axis:] == ", n2,
                           ". Size of scale and bias (if provided) must match this "
                           "and the size must not be 1. Got scale size of ",
                           scale_size, " and bias size of ", bias_size);
  }

  // Outputs
  Tensor* Y = ctx->Output(0, x_shape);
  auto Y_data = reinterpret_cast<CudaV*>(Y->MutableData<V>());

  // Mean and variance
  std::vector<int64_t> mean_inv_std_var_dim;
  for (int i = 0; i < static_cast<int>(x_shape.NumDimensions()); ++i) {
    if (i < axis) {
      mean_inv_std_var_dim.emplace_back(x_shape.GetDims()[i]);
    } else {
      mean_inv_std_var_dim.emplace_back(1);
    }
  }

  int output_index = 1;

  CudaU* mean_data = nullptr;
  if (!simplified) {
    Tensor* mean = ctx->Output(output_index++, TensorShape(mean_inv_std_var_dim));
    if (mean != nullptr) {
      mean_data = reinterpret_cast<CudaU*>(mean->MutableData<U>());
    }
  }

  CudaU* inv_var_data = nullptr;
  Tensor* var = ctx->Output(output_index, TensorShape(mean_inv_std_var_dim));
  if (var != nullptr) {
    inv_var_data = reinterpret_cast<CudaU*>(var->MutableData<U>());
  }

  if (x_shape.Size() == 0) {
    return Status::OK();
  }

  HostApplyLayerNorm<HipT, CudaU, CudaV, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data,
                                                     X_data, n1, n2, epsilon_, scale_data, bias_data);

  return Status::OK();
}

#if !defined(DISABLE_CONTRIB_OPS)
#define LAYERNORM_IMPL(T, U, V, simplified) \
template class LayerNorm<T, U, V, simplified>;
// contrib op usage
LAYERNORM_IMPL(float, float, float, false)
LAYERNORM_IMPL(double, double, double, false)
LAYERNORM_IMPL(MLFloat16, float, MLFloat16, false)
LAYERNORM_IMPL(float, float, MLFloat16, false)
LAYERNORM_IMPL(MLFloat16, float, float, false)
LAYERNORM_IMPL(BFloat16, float, BFloat16, false)
LAYERNORM_IMPL(float, float, float, true)
LAYERNORM_IMPL(double, double, double, true)
LAYERNORM_IMPL(MLFloat16, float, MLFloat16, true)
LAYERNORM_IMPL(float, float, MLFloat16, true)
LAYERNORM_IMPL(MLFloat16, float, float, true)
LAYERNORM_IMPL(BFloat16, float, BFloat16, true)
#endif
}  // namespace rocm
}  // namespace onnxruntime
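For reference, the kernel registered above computes, for each of the n1 rows of length n2, y = (x - mean) / sqrt(variance + epsilon) * scale + bias, with the simplified variant skipping both the mean subtraction and the bias. The following host-side sketch restates that definition with plain float buffers; it is illustrative only and not part of the provider code.

// Illustrative CPU reference for LayerNormalization (assumption: row-major x of n1 rows x n2
// values, gamma of length n2, beta of length n2 or null; simplified skips centering and bias).
#include <cmath>
#include <cstddef>

void LayerNormReference(const float* x, const float* gamma, const float* beta,
                        float* y, size_t n1, size_t n2, float epsilon, bool simplified) {
  for (size_t i = 0; i < n1; ++i) {
    const float* row = x + i * n2;
    float mean = 0.f, var = 0.f;
    for (size_t j = 0; j < n2; ++j) mean += row[j];
    mean = simplified ? 0.f : mean / n2;  // simplified variant does not center the data
    for (size_t j = 0; j < n2; ++j) var += (row[j] - mean) * (row[j] - mean);
    var /= n2;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (size_t j = 0; j < n2; ++j) {
      float v = (row[j] - mean) * inv_std * gamma[j];
      y[i * n2 + j] = (beta && !simplified) ? v + beta[j] : v;
    }
  }
}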
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/layer_norm.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

using namespace onnxruntime::rocm;
// NOTE: This was originally a contrib op with 3 type constraints. The ONNX spec merges 'T' and 'V'.
// the kernel is templatized on all three for backwards compatibility, but in ONNX usage T == V.
template <typename T, typename U, typename V, bool simplified>
class LayerNorm final : public RocmKernel {
 public:
  LayerNorm(const OpKernelInfo& op_kernel_info);

  Status ComputeInternal(OpKernelContext* ctx) const override;

 private:
  int64_t axis_;
  double epsilon_;
};
}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/layer_norm_impl.cu
0 → 100644
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#include "core/providers/rocm/cu_inc/common.cuh"
#include "layer_norm_impl.h"
namespace onnxruntime {
namespace rocm {

using namespace onnxruntime::rocm;
template <typename U, bool simplified>
__device__ void cuWelfordOnlineSum(
    const U curr,
    U& mu,
    U& sigma2,
    U& count) {
  count = count + U(1);
  U delta = curr - mu;
  U lmean = mu + delta / count;
  mu = lmean;
  if (simplified) {
    sigma2 = sigma2 + curr * curr;
  } else {
    U delta2 = curr - lmean;
    sigma2 = sigma2 + delta * delta2;
  }
}
template <typename U, bool simplified>
__device__ void cuChanOnlineSum(
    const U muB,
    const U sigma2B,
    const U countB,
    U& mu,
    U& sigma2,
    U& count) {
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
  count = count + countB;
  U nX = count;
  if (nX > U(0)) {
    nA = nA / nX;
    nB = nB / nX;
    mu = nA * mu + nB * muB;
    if (simplified) {
      sigma2 = sigma2 + sigma2B;
    } else {
      sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
    }
  } else {
    mu = U(0);
    sigma2 = U(0);
  }
}
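cuWelfordOnlineSum above is the scalar Welford update, and cuChanOnlineSum is Chan et al.'s merge of two partial (mean, M2, count) triples; the warp shuffles and shared-memory passes further down simply apply that merge repeatedly. The following host-side restatement of the same two recurrences is purely for illustration and is not part of the provider code.

// Illustrative host-side version of the Welford update and the Chan merge (floats only).
struct Moments {
  float mu = 0.f, m2 = 0.f, count = 0.f;
};

// Same recurrence as cuWelfordOnlineSum (non-simplified path).
void WelfordUpdate(Moments& a, float x) {
  a.count += 1.f;
  float delta = x - a.mu;
  a.mu += delta / a.count;
  a.m2 += delta * (x - a.mu);
}

// Same recurrence as cuChanOnlineSum: fold partial b into partial a.
Moments ChanMerge(Moments a, const Moments& b) {
  float delta = b.mu - a.mu;
  float n = a.count + b.count;
  if (n > 0.f) {
    a.mu = (a.count * a.mu + b.count * b.mu) / n;
    a.m2 = a.m2 + b.m2 + delta * delta * a.count * b.count / n;
    a.count = n;
  }
  return a;
}
// Once every element has been folded in, variance = m2 / count.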
template <typename T, typename U, bool simplified>
__device__ void cuWelfordMuSigma2(
    const T* __restrict__ vals,
    const int n1,
    const int n2,
    const int i1,
    U& mu,
    U& sigma2,
    U* buf) {
  // Assumptions:
  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  U count = U(0);
  mu = U(0);
  sigma2 = U(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const T* lvals = vals + i1 * n2;
    int l = 4 * thrx;
    for (; l + 3 < n2; l += 4 * numx) {
      for (int k = 0; k < 4; ++k) {
        U curr = static_cast<U>(lvals[l + k]);
        cuWelfordOnlineSum<U, simplified>(curr, mu, sigma2, count);
      }
    }
    for (; l < n2; ++l) {
      U curr = static_cast<U>(lvals[l]);
      cuWelfordOnlineSum<U, simplified>(curr, mu, sigma2, count);
    }
    // intra-warp reductions
#pragma unroll
    for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
      U muB = WARP_SHFL_DOWN(mu, stride);
      U countB = WARP_SHFL_DOWN(count, stride);
      U sigma2B = WARP_SHFL_DOWN(sigma2, stride);
      cuChanOnlineSum<U, simplified>(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      U* ubuf = (U*)buf;
      U* ibuf = (U*)(ubuf + blockDim.y);
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2 * wrt_y] = mu;
          ubuf[2 * wrt_y + 1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          U muB = ubuf[2 * threadIdx.y];
          U sigma2B = ubuf[2 * threadIdx.y + 1];
          U countB = ibuf[threadIdx.y];
          cuChanOnlineSum<U, simplified>(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1] / U(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
    }
  }
}
template <bool simplified>
__device__ void cuWelfordMuSigma2(
    const half* __restrict__ vals,
    const int n1,
    const int n2,
    const int i1,
    float& mu,
    float& sigma2,
    float* buf) {
  // Assumptions:
  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  float count = 0.0f;
  mu = float(0);
  sigma2 = float(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const half* lvals = vals + i1 * n2;
    int l = 8 * thrx;
    if ((((size_t)lvals) & 3) != 0) {
      // 16 bit alignment
      // first thread consumes first point
      if (thrx == 0) {
        float curr = static_cast<float>(lvals[0]);
        cuWelfordOnlineSum<float, simplified>(curr, mu, sigma2, count);
      }
      ++l;
    }
    // at this point, lvals[l] are 32 bit aligned for all threads.
    for (; l + 7 < n2; l += 8 * numx) {
      for (int k = 0; k < 8; k += 2) {
        float2 curr = __half22float2(*((__half2*)(lvals + l + k)));
        cuWelfordOnlineSum<float, simplified>(curr.x, mu, sigma2, count);
        cuWelfordOnlineSum<float, simplified>(curr.y, mu, sigma2, count);
      }
    }
    for (; l < n2; ++l) {
      float curr = static_cast<float>(lvals[l]);
      cuWelfordOnlineSum<float, simplified>(curr, mu, sigma2, count);
    }
    // intra-warp reductions
#pragma unroll
    for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
      float muB = WARP_SHFL_DOWN(mu, stride);
      float countB = WARP_SHFL_DOWN(count, stride);
      float sigma2B = WARP_SHFL_DOWN(sigma2, stride);
      cuChanOnlineSum<float, simplified>(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      float* ubuf = (float*)buf;
      float* ibuf = (float*)(ubuf + blockDim.y);
      for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2 * offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2 * wrt_y] = mu;
          ubuf[2 * wrt_y + 1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          float muB = ubuf[2 * threadIdx.y];
          float sigma2B = ubuf[2 * threadIdx.y + 1];
          float countB = ibuf[threadIdx.y];
          cuChanOnlineSum<float, simplified>(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
      // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1] / float(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
    }
  }
}
template <typename U>
__device__ U rsqrt(U v) {
  return U(1) / sqrt(v);
}

template <>
__device__ float rsqrt(float v) {
  return rsqrtf(v);
}

template <>
__device__ double rsqrt(double v) {
  return rsqrt(v);
}
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
//  template <typename T>
//  struct SharedMemory
//  {
//      // Ensure that we won't compile any un-specialized types
//      __device__ T *getPointer()
//      {
//          extern __device__ void error(void);
//          error();
//          return NULL;
//      }
//  };
// https://github.com/NVIDIA/apex/issues/246
template <typename T>
struct SharedMemory;

template <>
struct SharedMemory<float> {
  __device__ float* getPointer() {
    extern __shared__ float s_float[];
    return s_float;
  }
};

template <>
struct SharedMemory<double> {
  __device__ double* getPointer() {
    extern __shared__ double s_double[];
    return s_double;
  }
};
}  // namespace
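SharedMemory is specialized per type because a single extern __shared__ array cannot be redeclared with different element types in one translation unit (see the apex issue linked above); each specialization returns a typed pointer to the block's dynamically sized shared memory. The kernel below is hypothetical and shown only to illustrate the usual usage pattern; it is not part of this file.

// Hypothetical illustration of the SharedMemory<U> pattern: the byte size of the dynamic
// __shared__ region is supplied at launch time (the nshared argument used further down).
template <typename U>
__global__ void SharedMemoryExample(U* out, int n) {
  SharedMemory<U> shared;
  U* buf = shared.getPointer();  // typed view of the block's dynamic shared memory
  if (threadIdx.x < n) buf[threadIdx.x] = static_cast<U>(threadIdx.x);
  __syncthreads();
  if (threadIdx.x == 0) out[blockIdx.x] = buf[n - 1];
}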
template <typename T, typename U, typename V, bool simplified>
__global__ void cuApplyLayerNorm(
    V* __restrict__ output_vals,
    U* __restrict__ mean,
    U* __restrict__ inv_std_dev,
    const T* __restrict__ vals,
    const int n1,
    const int n2,
    const U epsilon,
    const V* __restrict__ gamma,
    const V* __restrict__ beta) {
  // Assumptions:
  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensors are contiguous
  //
  for (int i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu, sigma2;
    cuWelfordMuSigma2<T, U, simplified>(vals, n1, n2, i1, mu, sigma2, buf);
    const T* lvals = vals + i1 * n2;
    V* ovals = output_vals + i1 * n2;
    U c_inv_std_dev = rsqrt(sigma2 + epsilon);
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    for (int i = thrx; i < n2; i += numx) {
      U curr = static_cast<U>(lvals[i]);
      V gamma_i = (gamma != NULL) ? gamma[i] : (V)1;
      V beta_i = (beta != NULL) ? beta[i] : (V)0;
      if (simplified) {
        ovals[i] = gamma_i * static_cast<V>(c_inv_std_dev * curr);
      } else {
        ovals[i] = gamma_i * static_cast<V>(c_inv_std_dev * (curr - mu)) + beta_i;
      }
    }
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      if (mean != nullptr) mean[i1] = mu;
      if (inv_std_dev != nullptr) inv_std_dev[i1] = c_inv_std_dev;
    }
  }
}
template <typename T, typename U, typename V, bool simplified>
void HostApplyLayerNorm(
    const hipDeviceProp_t& prop,
    hipStream_t stream,
    V* output,
    U* mean,
    U* inv_std_dev,
    const T* input,
    int n1,
    int n2,
    double epsilon,
    const V* gamma,
    const V* beta) {
  const int maxGridY = prop.maxGridSize[1];
  const int warp_size = prop.warpSize;
  ORT_ENFORCE(warp_size == GPU_WARP_SIZE_HOST);

  dim3 threads(warp_size, 4, 1);
#ifdef __HIP_PLATFORM_HCC__
  // Optimization for ROCm MI100
  threads.y = 1;
#endif
  const dim3 blocks(1, std::min<unsigned int>(n1, maxGridY), 1);
  int nshared =
      threads.y > 1 ? threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0;
  hipLaunchKernelGGL(HIP_KERNEL_NAME(cuApplyLayerNorm<T, U, V, simplified>), blocks, threads, nshared, stream,
                     output, mean, inv_std_dev, input, n1, n2, U(epsilon), gamma, beta);
}
#define LAYERNORM_LINEAR_IMPL(T, U, V, simplified) \
template void HostApplyLayerNorm<T, U, V, simplified>(const hipDeviceProp_t& prop, hipStream_t stream, V* output, \
U* mean, U* inv_std_dev, const T* input, int n1, int n2, \
double epsilon, const V* gamma, const V* beta);
LAYERNORM_LINEAR_IMPL(float, float, float, true)
LAYERNORM_LINEAR_IMPL(half, float, half, true)
LAYERNORM_LINEAR_IMPL(double, double, double, true)
LAYERNORM_LINEAR_IMPL(float, float, half, true)
LAYERNORM_LINEAR_IMPL(half, float, float, true)
LAYERNORM_LINEAR_IMPL(float, float, float, false)
LAYERNORM_LINEAR_IMPL(half, float, half, false)
LAYERNORM_LINEAR_IMPL(double, double, double, false)
LAYERNORM_LINEAR_IMPL(double, float, double, false)
LAYERNORM_LINEAR_IMPL(float, float, half, false)
LAYERNORM_LINEAR_IMPL(half, float, float, false)
LAYERNORM_LINEAR_IMPL(BFloat16, float, BFloat16, true)
LAYERNORM_LINEAR_IMPL(BFloat16, float, BFloat16, false)
}  // namespace rocm
}  // namespace onnxruntime
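HostApplyLayerNorm above launches one warp per row in x, up to four warps per block in y (forced to a single warp on the HIP/HCC path), caps gridDim.y at the device limit so the grid-stride loop in cuApplyLayerNorm covers all rows, and sizes the dynamic shared memory from threads.y. The call below is a hypothetical usage sketch only; the d_* arguments are assumed to be pre-allocated device buffers and error handling is omitted.

// Hypothetical host-side usage of HostApplyLayerNorm (illustration, not provider code).
void RunLayerNormExample(hipStream_t stream, float* d_out, float* d_mean, float* d_inv_std,
                         const float* d_in, const float* d_gamma, int n1, int n2) {
  hipDeviceProp_t prop;
  hipGetDeviceProperties(&prop, /*deviceId=*/0);
  // Full LayerNormalization (simplified == false) with epsilon 1e-5 and no bias.
  HostApplyLayerNorm<float, float, float, false>(
      prop, stream, d_out, d_mean, d_inv_std, d_in, n1, n2, 1e-5, d_gamma, /*beta=*/nullptr);
}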
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/layer_norm_impl.h
0 → 100644
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// NVIDIA/apex is licensed under the
// BSD 3 - Clause "New" or "Revised" License
//
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {

template <typename T, typename U, typename V, bool simplified>
void HostApplyLayerNorm(
    const hipDeviceProp_t& prop,
    hipStream_t stream,
    V* output,
    U* mean,
    U* invvar,
    const T* input,
    int n1,
    int n2,
    double epsilon,
    const V* gamma,
    const V* beta);
}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/lrn.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "lrn.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_VERSIONED_TYPED(START_VER, END_VER, T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
LRN, \
kOnnxDomain, \
START_VER, \
END_VER, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
LRN<T>);
#define REGISTER_KERNEL_TYPED(VER, T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
LRN, \
kOnnxDomain, \
VER, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
LRN<T>);
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, float)
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, double)
REGISTER_KERNEL_VERSIONED_TYPED(1, 12, MLFloat16)
REGISTER_KERNEL_TYPED(13, float)
REGISTER_KERNEL_TYPED(13, double)
REGISTER_KERNEL_TYPED(13, MLFloat16)
template <typename T>
LRN<T>::LRN(const OpKernelInfo& info) : RocmKernel(info) {
  int64_t size;
  ORT_ENFORCE(info.GetAttr<int64_t>("size", &size).IsOK());
  ORT_ENFORCE(size > 0);
  ORT_ENFORCE(size % 2 == 1);

  float alpha;
  float beta;
  ORT_ENFORCE(info.GetAttr<float>("alpha", &alpha).IsOK());
  ORT_ENFORCE(alpha > 0.0f);
  ORT_ENFORCE(info.GetAttr<float>("beta", &beta).IsOK());
  ORT_ENFORCE(beta > 0.0f);
  float bias = info.GetAttrOrDefault<float>("bias", 1.0f);

  ORT_ENFORCE(norm_desc_
                  .Set(gsl::narrow_cast<uint32_t>(size),
                       static_cast<double>(alpha),
                       static_cast<double>(beta),
                       static_cast<double>(bias))
                  .IsOK());
}
template <typename T>
Status LRN<T>::ComputeInternal(OpKernelContext* context) const {
  typedef typename ToHipType<T>::MappedType HipT;

  const Tensor* X = context->Input<Tensor>(0);
  auto rank = X->Shape().NumDimensions();
  if (rank != 4 && rank != 5)
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "miopen LRN only supports 4D or 5D input");

  Tensor* Y = context->Output(0, X->Shape());

  MiopenTensor x_tensor;
  ORT_RETURN_IF_ERROR(x_tensor.Set(X->Shape().GetDims(), MiopenTensor::GetDataType<HipT>()));

  const auto one = Consts<HipT>::One;
  const auto zero = Consts<HipT>::Zero;

  MIOPEN_RETURN_IF_ERROR(LRNCrossChannelForwardHelper(
      MiopenHandle(),
      norm_desc_,
      miopenLRNCrossChannel,
      &one,
      x_tensor,
      reinterpret_cast<const HipT*>(X->Data<T>()),
      &zero,
      x_tensor,
      reinterpret_cast<HipT*>(Y->MutableData<T>())));

  return Status::OK();
}
MiopenLRNDescriptor::MiopenLRNDescriptor() : desc_(nullptr) {
}

MiopenLRNDescriptor::~MiopenLRNDescriptor() {
  if (desc_) {
    miopenDestroyLRNDescriptor(desc_);
    desc_ = nullptr;
  }
}

Status MiopenLRNDescriptor::Set(uint32_t N, double alpha, double beta, double K) {
  if (!desc_)
    MIOPEN_RETURN_IF_ERROR(miopenCreateLRNDescriptor(&desc_));

  MIOPEN_RETURN_IF_ERROR(SetLRNDescriptorHelper(desc_, N, alpha, beta, K));
  return Status::OK();
}
}  // namespace rocm
}  // namespace onnxruntime
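For reference, the ONNX LRN definition that this kernel hands to MIOpen divides each value by (bias + alpha/size * sum of squares over a window of `size` neighboring channels) raised to the power beta. The sketch below restates that definition on the CPU for NCHW input; it is illustrative only, and the provider itself goes through miopenLRNCrossChannel rather than this code.

// Illustrative CPU reference for ONNX LRN on NCHW input (assumption: size is odd, as the
// constructor above enforces, so the channel window is symmetric around c).
#include <algorithm>
#include <cmath>
#include <cstdint>

void LrnReference(const float* x, float* y, int64_t N, int64_t C, int64_t H, int64_t W,
                  int64_t size, float alpha, float beta, float bias) {
  const int64_t half = (size - 1) / 2;
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < H; ++h)
        for (int64_t w = 0; w < W; ++w) {
          float square_sum = 0.f;
          for (int64_t i = std::max<int64_t>(0, c - half);
               i <= std::min<int64_t>(C - 1, c + half); ++i) {
            float v = x[((n * C + i) * H + h) * W + w];
            square_sum += v * v;
          }
          const int64_t idx = ((n * C + c) * H + h) * W + w;
          y[idx] = x[idx] / std::pow(bias + alpha / size * square_sum, beta);
        }
}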
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/nn/lrn.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {

class MiopenLRNDescriptor final {
 public:
  MiopenLRNDescriptor();
  ~MiopenLRNDescriptor();
  Status Set(uint32_t N, double alpha, double beta, double K);

  operator miopenLRNDescriptor_t() const { return desc_; }

 private:
  miopenLRNDescriptor_t desc_;
};
template <typename T>
class LRN : public RocmKernel {
 public:
  LRN(const OpKernelInfo& info);

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;

 private:
  MiopenLRNDescriptor norm_desc_;
};
}  // namespace rocm
}  // namespace onnxruntime