gaoqiong / onnxruntime_v14 / Commits

Commit 1a91fcc2, authored Jul 25, 2023 by gaoqiong

    add files required by dtk

Parent: a144865d
Pipeline #492 failed with stages in 0 seconds
Changes: 280 · Pipelines: 1
Showing 20 changed files (the first page of the commit's 280 changed files) with 1499 additions and 0 deletions (+1499 −0).
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/longformer_attention_softmax.h   +50 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/longformer_global_impl.cu        +83 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/longformer_global_impl.h         +27 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block.cc            +71 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block.h             +25 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block_impl.cu       +85 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block_impl.h        +26 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/remove_padding.cc                +114 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/remove_padding.h                 +23 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/restore_padding.cc               +82 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/restore_padding.h                +23 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/transformer_rocm_common.h        +33 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample.cc                        +92 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample.h                         +29 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample_impl.cu                   +234 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample_impl.h                    +26 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/layer_norm.cc                         +37 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/math/bias_dropout.cc                  +145 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/math/bias_dropout.h                   +41 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/math/bias_dropout_impl.cu             +253 −0
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/longformer_attention_softmax.h
new file mode 100644

/*
Copyright (c) NVIDIA Corporation and Microsoft Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#pragma once
#include "core/common/common.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

// Launch the softmax kernel that does not use compact memory.
Status LaunchLongformerSoftmaxSimpleKernel(
    hipStream_t stream,
    rocblas_handle rocblas,
    void* workspace,              // softmax space
    const void* q,                // transposed Q with shape (B, N, S, H)
    const void* k,                // transposed K with shape (B, N, S, H)
    const void* v,                // transposed V with shape (B, N, S, H)
    const void* attention_mask,   // attention mask with shape (B, S), with value 0.0 not masked, and -10000.0 masked.
    const void* global_q,         // Q for global tokens with shape (B, N, S, H)
    const void* global_k,         // K for global tokens with shape (B, N, S, H)
    const void* global_v,         // V for global tokens with shape (B, N, S, H)
    const int* global_attention,  // global attention flags with shape (B, S), with value 0 for local and 1 for global.
    const int* global_index,      // Global index with shape (B, S)
    const int* batch_global_num,  // Number of global tokens per batch with shape (B, 1)
    void* pinned_buffer,          // Pinned memory in CPU. Number of global tokens per batch with shape (B, 1)
    void* output,                 // output with shape (B, N, S, H)
    float scaler,                 // scalar
    int batch_size,               // batch size
    int sequence_length,          // sequence length
    int num_heads,                // number of heads
    int head_size,                // hidden size per head
    int attention_window,         // one-sided window size
    size_t element_size);

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
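The declaration above pins down the buffer convention: q/k/v and their global counterparts are transposed tensors of shape (B, N, S, H) whose element width is passed separately as element_size. A minimal standalone sketch of the sizing arithmetic a caller has to do when allocating those buffers; the helper name QkvBytes and the example dimensions are illustrative, not part of the commit:

#include <cstddef>
#include <cstdio>

// One transposed tensor with shape (B, N, S, H), element_size bytes per value.
size_t QkvBytes(int batch_size, int num_heads, int sequence_length,
                int head_size, size_t element_size) {
  return static_cast<size_t>(batch_size) * num_heads * sequence_length *
         head_size * element_size;
}

int main() {
  // Example values only; real sizes come from the model.
  size_t bytes = QkvBytes(/*B*/ 2, /*N*/ 12, /*S*/ 4096, /*H*/ 64,
                          /*element_size*/ sizeof(float));
  std::printf("each of q/k/v needs %zu bytes\n", bytes);  // 2*12*4096*64*4
  return 0;
}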
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/longformer_global_impl.cu
new file mode 100644

#include "hip/hip_runtime.h"
/*
Copyright (c) NVIDIA Corporation and Microsoft Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <hipcub/hipcub.hpp>
#include <hipcub/device/device_partition.hpp>
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "longformer_global_impl.h"

using namespace onnxruntime::rocm;
using namespace hipcub;

namespace onnxruntime {
namespace contrib {
namespace rocm {

size_t GetGlobalScratchSize(int sequence_length) {
  // Global Index scratch layout:
  //   [sequence_index: int S][tmp_storage: int 1024x1]
  return sizeof(int) * (sequence_length + 1024);
}

__global__ void InitSequenceIndexKernel(int* sequence_index, int sequence_length) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < sequence_length; i += blockDim.x) {
    sequence_index[i] = i;
  }
}

Status BuildGlobalIndex(
    const hipDeviceProp_t& device_prop,
    hipStream_t stream,
    const int* global_attention,
    int batch_size,
    int sequence_length,
    int* global_index,
    int* batch_global_num,
    void* scratch,
    size_t scratch_size) {
  int* sequence_index = (int*)scratch;
  int* tmp_storage = sequence_index + sequence_length;

  const int threads = device_prop.maxThreadsPerBlock;
  int blocks = CeilDiv(sequence_length, threads);
  hipLaunchKernelGGL(InitSequenceIndexKernel, blocks, threads, 0, stream, sequence_index, sequence_length);

  // Determine temporary device storage size.
  // For int* inputs/outputs, it needs 767 bytes. We reserved 1024*4 bytes, which shall be enough.
  size_t temp_storage_bytes = 0;
  HIP_RETURN_IF_ERROR(hipcub::DevicePartition::Flagged(
      NULL, temp_storage_bytes, sequence_index, global_attention,
      global_index, batch_global_num, sequence_length, stream));

  if (temp_storage_bytes + sizeof(int) * sequence_length > scratch_size) {
    ORT_THROW("LongformerAttention scratch space is not large enough. Temp storage bytes are",
              temp_storage_bytes);
  }

  // Find the global attention indices and number of global attention tokens
  for (int i = 0; i < batch_size; ++i) {
    HIP_RETURN_IF_ERROR(hipcub::DevicePartition::Flagged(
        reinterpret_cast<void*>(tmp_storage), temp_storage_bytes, sequence_index,
        global_attention + i * sequence_length, global_index + i * sequence_length,
        batch_global_num + i, sequence_length, stream));
  }
  return Status::OK();
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
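GetGlobalScratchSize packs two regions into one allocation: S ints of sequence index followed by 1024 ints reserved for hipcub temporary storage, and BuildGlobalIndex rejects the buffer if hipcub asks for more than the reserved tail. A standalone sketch of that arithmetic, assuming nothing beyond what the file shows (the function names here are illustrative, not exported by the commit):

#include <cstddef>
#include <cstdio>

size_t GlobalScratchBytes(int sequence_length) {
  // [sequence_index: int S][tmp_storage: int 1024]
  return sizeof(int) * (static_cast<size_t>(sequence_length) + 1024);
}

// Mirrors the ORT_THROW guard in BuildGlobalIndex above.
bool FitsInScratch(size_t temp_storage_bytes, int sequence_length, size_t scratch_size) {
  return temp_storage_bytes + sizeof(int) * static_cast<size_t>(sequence_length) <= scratch_size;
}

int main() {
  int s = 4096;
  size_t scratch = GlobalScratchBytes(s);  // 4 * (4096 + 1024) = 20480 bytes
  std::printf("scratch=%zu bytes, 767-byte temp storage fits: %d\n",
              scratch, FitsInScratch(767, s, scratch));
  return 0;
}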
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/longformer_global_impl.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {
namespace contrib {
namespace rocm {

// Size of global Index scratch in bytes.
size_t GetGlobalScratchSize(int sequence_length);

// Find the global attention indices and number of global attention tokens
Status BuildGlobalIndex(
    const hipDeviceProp_t& device_prop,
    hipStream_t stream,
    const int* global_attention,
    int batch_size,
    int sequence_length,
    int* global_index,
    int* batch_global_num,
    void* scratch,
    size_t scratch_size);

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block.cc
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_common.h"
#include "ngram_repeat_block.h"
#include "ngram_repeat_block_impl.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    NGramRepeatBlock,
    kMSDomain,
    1,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("Tid", DataTypeImpl::GetTensorType<int64_t>())
        .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    NGramRepeatBlock);

using namespace ONNX_NAMESPACE;

NGramRepeatBlock::NGramRepeatBlock(const OpKernelInfo& info) : RocmKernel(info) {
  ORT_ENFORCE(info.GetAttr<int64_t>("ngram_size", &ngram_size_).IsOK());
  ORT_ENFORCE(ngram_size_ > 0);
}

Status NGramRepeatBlock::ComputeInternal(OpKernelContext* context) const {
  const Tensor* input_ids = context->Input<Tensor>(0);
  const Tensor* scores = context->Input<Tensor>(1);
  Tensor* output = context->Output(0, scores->Shape());

  const auto* scores_source = static_cast<const float*>(scores->DataRaw());
  auto* scores_target = static_cast<float*>(output->MutableDataRaw());
  if (scores_source != scores_target) {
    HIP_RETURN_IF_ERROR(hipMemcpyAsync(scores_target, scores_source,
                                       scores->Shape().Size() * sizeof(float),
                                       hipMemcpyDeviceToDevice, Stream()));
  }

  const auto& input_ids_dims = input_ids->Shape().GetDims();
  const auto& scores_dims = scores->Shape().GetDims();
  ORT_ENFORCE(input_ids_dims.size() == 2);
  ORT_ENFORCE(scores_dims.size() == 2);
  int64_t batch_size = input_ids_dims[0];
  int64_t cur_len = input_ids_dims[1];
  ORT_ENFORCE(scores_dims[0] == batch_size);
  int64_t vocab_size = scores_dims[1];

  if (cur_len + 1 < ngram_size_) {
    return Status::OK();
  }

  const auto* input_ids_data = static_cast<const int64_t*>(input_ids->DataRaw(input_ids->DataType()));

  NGramRepeatBlockImpl(
      Stream(),
      input_ids_data,
      scores_target,
      gsl::narrow_cast<int>(batch_size),
      gsl::narrow_cast<int>(cur_len - 1),
      gsl::narrow_cast<int>(cur_len),
      gsl::narrow_cast<int>(vocab_size),
      gsl::narrow_cast<int>(1),
      gsl::narrow_cast<int>(ngram_size_));

  return Status::OK();
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
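ComputeInternal hands the kernel cur_len - 1 as the step and bans, for every n-gram already present in input_ids, the token that would complete it again. A CPU re-implementation of that banning rule for intuition only; it is not part of the commit and simplifies to a batch of one and a beam of one:

#include <cstdio>
#include <limits>
#include <vector>

// Ban every token that would recreate an n-gram already seen in `tokens`.
void BanRepeatedNgrams(const std::vector<long long>& tokens,
                       std::vector<float>& scores, int ngram_size) {
  int cur_len = static_cast<int>(tokens.size());
  if (cur_len + 1 < ngram_size) return;
  // The (ngram_size - 1)-token suffix the next token would extend.
  int suffix_start = cur_len - (ngram_size - 1);
  for (int i = 0; i + ngram_size - 1 < cur_len; ++i) {
    bool match = true;
    for (int k = 0; k < ngram_size - 1; ++k) {
      if (tokens[i + k] != tokens[suffix_start + k]) { match = false; break; }
    }
    if (match) scores[tokens[i + ngram_size - 1]] = -std::numeric_limits<float>::infinity();
  }
}

int main() {
  std::vector<long long> tokens = {5, 7, 5, 7};  // ngram_size=3: current suffix is (5, 7)
  std::vector<float> scores(10, 0.f);
  BanRepeatedNgrams(tokens, scores, 3);
  std::printf("score[5]=%f\n", scores[5]);  // -inf: (5, 7, 5) already occurred at position 0
  return 0;
}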
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

using namespace onnxruntime::rocm;

class NGramRepeatBlock final : public RocmKernel {
 public:
  NGramRepeatBlock(const OpKernelInfo& op_kernel_info);
  Status ComputeInternal(OpKernelContext* ctx) const override;

 private:
  int64_t ngram_size_;
};

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block_impl.cu
new file mode 100644

#include "hip/hip_runtime.h"
/*
Copyright (c) Microsoft Corporation.
Licensed under the MIT License.
*/

/*
Kernel implementation for blocking repeated n-grams.
*/

#include "core/providers/rocm/cu_inc/common.cuh"
#include "contrib_ops/rocm/bert/ngram_repeat_block_impl.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

using namespace onnxruntime::rocm;

// Ban repeated ngrams of length = 'no_repeat_ngram_size'
__global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,
                                  float* __restrict__ lprobs,
                                  int max_predict_len,
                                  int vocab_size,
                                  int no_repeat_ngram_size) {
  auto row = blockIdx.x;
  auto col = threadIdx.x;
  auto start = row * (max_predict_len) + col;
  // Each thread compares ngram starting from
  // thread index with final ngram starting from
  // step - no_repeat_ngram_size + 2
  auto check_start_pos = blockDim.x;
  auto lprob_start = row * vocab_size;
  bool is_banned = true;
  extern __shared__ int64_t tokens_shm[];
  tokens_shm[col] = tokens[start];
  if (col == blockDim.x - 1) {
    for (int i = 1; i < no_repeat_ngram_size; i++) {
      if (col + i < max_predict_len) {
        tokens_shm[col + i] = tokens[start + i];
      }
    }
  }
  __syncthreads();

  for (int k = 0; k < no_repeat_ngram_size - 1; k++) {
    if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) {
      is_banned = false;
    }
  }
  if (is_banned == true) {
    auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
  }
}

// Allocate blocks and threads based on
// batch size and sequence length and launch
// kernel
void NGramRepeatBlockImpl(
    hipStream_t stream,
    const int64_t* tokens_ptr,
    float* scores_ptr,
    int bsz,
    int step,
    int max_predict_len,
    int vocab_size,
    int beam_size,
    int no_repeat_ngram_size) {
  int threads = step - no_repeat_ngram_size + 2;
  if (threads <= 0) return;
  int blocks = bsz * beam_size;
  int shared_mem_size = (step + 1) * sizeof(int64_t);

  // Launching N blocks where N is number of samples in a batch (beams*bsz)
  // Launching T threads where T is number of previous ngrams in a sample
  // Allocating shared mem per block for faster access of input tokens since
  // each token will be accessed N times to compare with current Ngram where
  // N is Ngram size.
  hipLaunchKernelGGL(banRepeatedTokens, blocks, threads, shared_mem_size, stream,
                     tokens_ptr, scores_ptr, max_predict_len, vocab_size, no_repeat_ngram_size);
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
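The launch configuration in NGramRepeatBlockImpl encodes the comment above: one block per sequence (bsz * beam_size), one thread per candidate n-gram start (step - no_repeat_ngram_size + 2), and a shared-memory window of step + 1 tokens. The same arithmetic reproduced standalone, with example numbers (all values illustrative):

#include <cstdio>

int main() {
  // Illustrative values: decoding step 10, tri-gram blocking, batch of 4.
  int bsz = 4, beam_size = 1, step = 10, no_repeat_ngram_size = 3;
  int threads = step - no_repeat_ngram_size + 2;  // 9 n-gram start positions to check
  int blocks = bsz * beam_size;                   // one block per sequence in the batch
  int shared_mem_size = (step + 1) * static_cast<int>(sizeof(long long));  // 88 bytes of tokens
  std::printf("blocks=%d threads=%d shared_mem=%d bytes\n", blocks, threads, shared_mem_size);
  return 0;
}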
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/ngram_repeat_block_impl.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

using namespace onnxruntime::rocm;

void NGramRepeatBlockImpl(
    hipStream_t stream,
    const int64_t* tokens_ptr,
    float* scores_ptr,
    int bsz,
    int step,
    int max_predict_len,
    int vocab_size,
    int beam_size,
    int no_repeat_ngram_size);

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/remove_padding.cc
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_common.h"
#include "contrib_ops/rocm/bert/remove_padding.h"
#include "contrib_ops/rocm/bert/bert_padding.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T)                                               \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                               \
      RemovePadding,                                                           \
      kMSDomain,                                                               \
      1,                                                                       \
      T,                                                                       \
      kRocmExecutionProvider,                                                  \
      (*KernelDefBuilder::Create())                                            \
          .OutputMemoryType(OrtMemTypeCPUOutput, 3) /*max_token_count on CPU*/ \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()),              \
      RemovePadding<T>);

REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(MLFloat16)

using namespace ONNX_NAMESPACE;

template <typename T>
RemovePadding<T>::RemovePadding(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {
}

template <typename T>
Status RemovePadding<T>::ComputeInternal(OpKernelContext* context) const {
  // shape of inputs:
  //   input:                (batch_size, sequence_length, hidden_size)
  //   sequence_token_count: (batch_size)
  // shape of outputs:
  //   output:               (total_tokens, hidden_size)
  //   token_offset:         (batch_size, sequence_length)
  //   cumulated_seq_len:    (batch_size + 1)
  //   max_token_count:      (1)
  const Tensor* input = context->Input<Tensor>(0);
  const Tensor* sequence_token_count = context->Input<Tensor>(1);

  const auto& dims = input->Shape().GetDims();
  if (dims.size() != 3) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input 'input' is expected to have 3 dimensions, got ", dims.size());
  }
  int64_t batch_size = dims[0];
  int64_t sequence_length = dims[1];
  int64_t hidden_size = dims[2];

  auto token_count_buffer = GetScratchBuffer<int>(2);

  TensorShapeVector token_offset_shape(2);
  token_offset_shape[0] = batch_size;
  token_offset_shape[1] = sequence_length;
  Tensor* token_offset = context->Output(1, token_offset_shape);

  TensorShapeVector cumulated_seq_len_shape(1);
  cumulated_seq_len_shape[0] = batch_size + static_cast<int64_t>(1);
  Tensor* cumulated_seq_len = context->Output(2, cumulated_seq_len_shape);

  LaunchGetTokenOffset(token_count_buffer.get(),
                       token_offset->MutableData<int>(),
                       cumulated_seq_len->MutableData<int>(),
                       sequence_token_count->Data<int>(),
                       static_cast<int>(batch_size),
                       static_cast<int>(sequence_length),
                       Stream());
  HIP_RETURN_IF_ERROR(hipGetLastError());

  // Copy token_count to CPU
  auto pinned_buffer = AllocateBufferOnCPUPinned<int>(2);
  int* token_count_pinned = pinned_buffer.get();
  HIP_RETURN_IF_ERROR(hipMemcpyAsync(token_count_pinned,
                                     token_count_buffer.get(),
                                     sizeof(int) * 2,
                                     hipMemcpyDeviceToHost,
                                     Stream()));
  // Wait until token_count is copied to host.
  HIP_RETURN_IF_ERROR(hipStreamSynchronize(Stream()));
  int total_token_count = token_count_pinned[0];
  int max_token_count = token_count_pinned[1];

  TensorShapeVector output_shape(2);
  output_shape[0] = static_cast<int64_t>(total_token_count);
  output_shape[1] = hidden_size;
  Tensor* output = context->Output(0, output_shape);

  TensorShapeVector max_token_count_shape(1);
  max_token_count_shape[0] = 1;
  Tensor* max_token_count_tensor = context->Output(3, max_token_count_shape);
  max_token_count_tensor->MutableData<int>()[0] = max_token_count;

  typedef typename ToHipType<T>::MappedType HipT;
  LaunchRemovePadding<HipT>(
      reinterpret_cast<HipT*>(output->MutableData<T>()),
      reinterpret_cast<const HipT*>(input->Data<T>()),
      token_offset->Data<int>(),
      total_token_count,
      static_cast<int>(hidden_size),
      Stream());
  HIP_RETURN_IF_ERROR(hipGetLastError());

  return Status::OK();
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
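The host-side bookkeeping above reduces to a prefix sum over sequence_token_count: the packed output keeps total_tokens rows, and max_token_count is surfaced as a CPU output. A toy CPU rerun of those shape computations (a sketch with made-up counts, not the commit's LaunchGetTokenOffset kernel):

#include <cstdio>
#include <vector>

int main() {
  // Padded input (batch_size=2, sequence_length=4, hidden_size=8) with real
  // token counts {3, 1} packs into an output of shape (total_tokens=4, 8).
  std::vector<int> sequence_token_count = {3, 1};
  int total_tokens = 0, max_token_count = 0;
  std::vector<int> cumulated_seq_len = {0};
  for (int count : sequence_token_count) {
    total_tokens += count;
    if (count > max_token_count) max_token_count = count;
    cumulated_seq_len.push_back(total_tokens);  // running prefix sum
  }
  std::printf("total_tokens=%d max_token_count=%d cumulated_seq_len={%d,%d,%d}\n",
              total_tokens, max_token_count,
              cumulated_seq_len[0], cumulated_seq_len[1], cumulated_seq_len[2]);
  return 0;
}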
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/remove_padding.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

using namespace onnxruntime::rocm;

template <typename T>
class RemovePadding final : public RocmKernel {
 public:
  RemovePadding(const OpKernelInfo& op_kernel_info);
  Status ComputeInternal(OpKernelContext* ctx) const override;
};

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/restore_padding.cc
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_common.h"
#include "contrib_ops/rocm/bert/restore_padding.h"
#include "contrib_ops/rocm/bert/bert_padding.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T)                                  \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
      RestorePadding,                                             \
      kMSDomain,                                                  \
      1,                                                          \
      T,                                                          \
      kRocmExecutionProvider,                                     \
      (*KernelDefBuilder::Create())                               \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      RestorePadding<T>);

REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(MLFloat16)

using namespace ONNX_NAMESPACE;

template <typename T>
RestorePadding<T>::RestorePadding(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {
}

template <typename T>
Status RestorePadding<T>::ComputeInternal(OpKernelContext* context) const {
  // shape of inputs:
  //   input:        (total_tokens, hidden_size)
  //   token_offset: (batch_size, sequence_length)
  // shape of outputs:
  //   output:       (batch_size, sequence_length, hidden_size)
  const Tensor* input = context->Input<Tensor>(0);
  const Tensor* token_offset = context->Input<Tensor>(1);

  const auto& dims = input->Shape().GetDims();
  if (dims.size() != 2) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input 'input' is expected to have 2 dimensions, got ", dims.size());
  }
  int64_t total_tokens = dims[0];
  int64_t hidden_size = dims[1];

  const auto& token_offset_dims = token_offset->Shape().GetDims();
  if (token_offset_dims.size() != 2) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Input 'token_offset' is expected to have 2 dimensions, got ", token_offset_dims.size());
  }
  int64_t batch_size = token_offset_dims[0];
  int64_t sequence_length = token_offset_dims[1];

  TensorShapeVector output_shape(3);
  output_shape[0] = batch_size;
  output_shape[1] = sequence_length;
  output_shape[2] = hidden_size;
  Tensor* output = context->Output(0, output_shape);

  typedef typename ToHipType<T>::MappedType HipT;
  LaunchRestorePadding<HipT>(
      reinterpret_cast<HipT*>(output->MutableData<T>()),
      reinterpret_cast<const HipT*>(input->Data<T>()),
      token_offset->Data<int>(),
      static_cast<int>(total_tokens),
      static_cast<int>(hidden_size),
      static_cast<int>(batch_size),
      static_cast<int>(sequence_length),
      Stream());
  HIP_RETURN_IF_ERROR(hipGetLastError());

  return Status::OK();
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/restore_padding.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

using namespace onnxruntime::rocm;

template <typename T>
class RestorePadding final : public RocmKernel {
 public:
  RestorePadding(const OpKernelInfo& op_kernel_info);
  Status ComputeInternal(OpKernelContext* ctx) const override;
};

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/bert/transformer_rocm_common.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/providers/rocm/rocm_common.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

// A wrapper class of hipEvent_t to destroy the event automatically for avoiding memory leak.
class AutoDestoryCudaEvent {
 public:
  AutoDestoryCudaEvent() : rocm_event_(nullptr) {
  }

  ~AutoDestoryCudaEvent() {
    if (rocm_event_ != nullptr)
      (void)hipEventDestroy(rocm_event_);
  }

  hipEvent_t& Get() {
    return rocm_event_;
  }

 private:
  hipEvent_t rocm_event_;
};

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
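A usage sketch of the RAII wrapper (not in the commit; requires a HIP toolchain, assumes the header above is included, and ignores error codes for brevity). Note the class name is spelled "Destory" in the source, so callers must match that spelling:

#include <hip/hip_runtime.h>

void RecordOnStream(hipStream_t stream) {
  AutoDestoryCudaEvent start_event;            // event handle starts as nullptr
  (void)hipEventCreate(&start_event.Get());    // Get() returns hipEvent_t& for in-place creation
  (void)hipEventRecord(start_event.Get(), stream);
  // ... enqueue work on the stream ...
}  // destructor calls hipEventDestroy, even on early return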
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample.cc
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "grid_sample.h"
#include "grid_sample_impl.h"

using namespace onnxruntime::rocm;

namespace onnxruntime {
namespace contrib {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T)                                   \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                   \
      GridSample,                                                  \
      kMSDomain,                                                   \
      1,                                                           \
      T,                                                           \
      kRocmExecutionProvider,                                      \
      (*KernelDefBuilder::Create())                                \
          .TypeConstraint("T1", DataTypeImpl::GetTensorType<T>())  \
          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
      GridSample<T>);

REGISTER_KERNEL_TYPED(float)

template <typename T>
GridSample<T>::GridSample(const OpKernelInfo& info) : RocmKernel(info) {
  std::string mode_str = info.GetAttrOrDefault<std::string>("mode", "bilinear");
  std::string padding_mode_str = info.GetAttrOrDefault<std::string>("padding_mode", "zeros");
  align_corners_ = static_cast<bool>(info.GetAttrOrDefault<int64_t>("align_corners", 0));
  ORT_ENFORCE(mode_str == "bilinear" || mode_str == "nearest" || mode_str == "bicubic",
              "mode \"", mode_str, "\" not supported, expect bilinear, nearest or bicubic");
  ORT_ENFORCE(padding_mode_str == "zeros" || padding_mode_str == "border" || padding_mode_str == "reflection",
              "padding_mode \"", padding_mode_str, "\" not supported, expect zeros, border or reflection");
  if (mode_str == "bicubic") {
    mode_i_ = 2;
  } else if (mode_str == "nearest") {
    mode_i_ = 1;
  } else {
    mode_i_ = 0;
  }
  if (padding_mode_str == "reflection") {
    padding_mode_i_ = 2;
  } else if (padding_mode_str == "border") {
    padding_mode_i_ = 1;
  } else {
    padding_mode_i_ = 0;
  }
}

template <typename T>
Status GridSample<T>::ComputeInternal(OpKernelContext* context) const {
  const Tensor* X = context->Input<Tensor>(0);
  const auto& dims_input = X->Shape().GetDims();
  const Tensor* Grid = context->Input<Tensor>(1);
  const auto& dims_grid = Grid->Shape().GetDims();

  if (dims_input.size() != 4 || dims_grid.size() != 4) {
    return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Only 4-D tensor is supported");
  }
  ORT_ENFORCE(dims_grid[0] == dims_input[0], "Grid batch size ", dims_grid[0],
              " does not match input batch size ", dims_input[0]);
  ORT_ENFORCE(dims_grid[3] == 2, "Last dimension of grid: ", dims_grid[3], ", expect 2");

  TensorShapeVector dims_output(4);
  dims_output[0] = dims_input[0];
  dims_output[1] = dims_input[1];
  dims_output[2] = dims_grid[1];
  dims_output[3] = dims_grid[2];
  Tensor* Y = context->Output(0, dims_output);
  // Return early if the output tensor is going to be of size 0
  if (Y->Shape().Size() == 0) {
    return Status::OK();
  }

  typedef typename ToHipType<T>::MappedType HipT;
  HipT* Y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
  GridSampleImpl<HipT>(
      Stream(),
      reinterpret_cast<const HipT*>(X->Data<T>()),
      reinterpret_cast<const HipT*>(Grid->Data<T>()),
      mode_i_,
      padding_mode_i_,
      align_corners_,
      dims_input.data(),
      dims_grid[1],
      dims_grid[2],
      Y_data);
  return Status::OK();
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

using namespace onnxruntime::rocm;

template <typename T>
class GridSample final : public RocmKernel {
 public:
  explicit GridSample(const OpKernelInfo& info);
  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  int64_t mode_i_;          // 0: bilinear (default), 1: nearest, 2: bicubic
  int64_t padding_mode_i_;  // 0: 'zeros', 1: 'border', 2: 'reflection'
  int64_t align_corners_;
};

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample_impl.cu
new file mode 100644

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/cu_inc/common.cuh"
#include "grid_sample_impl.h"

using namespace onnxruntime::rocm;

namespace onnxruntime {
namespace contrib {
namespace rocm {

template <typename T>
__device__ T GsDenormalize(T n, int64_t length, bool align_corners) {
  T x = {};
  if (align_corners) {  // align_corners: true => [-1, 1] to [0, length - 1]
    x = (n + static_cast<T>(1)) / static_cast<T>(2) * (length - 1);
  } else {  // align_corners: false => [-1, 1] to [-0.5, length - 0.5]
    x = ((n + static_cast<T>(1)) * length - static_cast<T>(1)) / static_cast<T>(2);
  }
  return x;
}

template <typename T>
__device__ T GsReflect(T x, float x_min, float x_max) {
  float fx = static_cast<float>(x);
  float dx = {};
  float range = x_max - x_min;
  if (fx < x_min) {
    dx = x_min - fx;
    int n = static_cast<int>(dx / range);
    float r = dx - n * range;
    if (n % 2 == 0) {
      fx = x_min + r;
    } else {
      fx = x_max - r;
    }
  } else if (fx > x_max) {
    dx = fx - x_max;
    int n = static_cast<int>(dx / range);
    float r = dx - n * range;
    if (n % 2 == 0) {
      fx = x_max - r;
    } else {
      fx = x_min + r;
    }
  }  // else fallthrough
  return static_cast<T>(fx);
}

template <typename T>
__device__ T PixelAtGrid(const T* input_data, int64_t bIdx, int64_t cIdx, int64_t y, int64_t x,
                         int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) {
  T pixel = 0.0f;
  if (padding_mode == 0) {  // zeros
    if (x >= 0 && x < W && y >= 0 && y < H) {
      pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x];
    }
  } else if (padding_mode == 1) {  // border
    x = max((int64_t)0, min((int64_t)W - 1, (int64_t)x));
    y = max((int64_t)0, min((int64_t)H - 1, (int64_t)y));
    pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x];
  } else {  // reflection
    x = (int64_t)GsReflect<T>(x, border[0], border[2]);
    y = (int64_t)GsReflect<T>(y, border[1], border[3]);
    pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x];
  }
  return pixel;
}

__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) {
  float cubic_alpha = -0.75f;
  x = abs(x);
  coeffs[0] = (((cubic_alpha * (x + 1) - 5 * cubic_alpha) * (x + 1) + 8 * cubic_alpha) * (x + 1) - 4 * cubic_alpha);
  coeffs[1] = (((cubic_alpha + 2) * x - (cubic_alpha + 3)) * x * x + 1);
  coeffs[2] = (((cubic_alpha + 2) * (1 - x) - (cubic_alpha + 3)) * (1 - x) * (1 - x) + 1);
  coeffs[3] = (((cubic_alpha * (2 - x) - 5 * cubic_alpha) * (2 - x) + 8 * cubic_alpha) * (2 - x) - 4 * cubic_alpha);
}

template <typename T>
__device__ T GsBicubicInterpolate(T p[4][4], float x, float y) {
  float v[4] = {};
  float coeffs[4] = {};
  GsGetCubicCoeffs(x, coeffs);
  for (int64_t i = 0; i < 4; i++) {
    v[i] = coeffs[0] * p[i][0] + coeffs[1] * p[i][1] + coeffs[2] * p[i][2] + coeffs[3] * p[i][3];
  }
  GsGetCubicCoeffs(y, coeffs);
  T pixel = static_cast<T>(coeffs[0] * v[0] + coeffs[1] * v[1] + coeffs[2] * v[2] + coeffs[3] * v[3]);
  return pixel;
}

template <typename T>
__global__ void _GridSampleKernel(
    const T* input_data,
    const T* grid_data,
    const int64_t mode,
    const int64_t padding_mode,
    const int64_t align_corners,
    const int64_t N,
    const int64_t C,
    const int64_t H_in,
    const int64_t W_in,
    const int64_t H_out,
    const int64_t W_out,
    T* output_data) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(idx, N * C * H_out * W_out);
  // extract batch index, channel index, y index, x index for current thread
  int BIdx = idx / (C * H_out * W_out);
  int tmpBCnt = BIdx * (C * H_out * W_out);
  int cIdx = (idx - tmpBCnt) / (H_out * W_out);
  int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out);
  int yIdx = (idx - tmpCCnt) / W_out;
  int tmpHCnt = tmpCCnt + yIdx * W_out;
  int xIdx = (idx - tmpHCnt);

  int grid_idx = BIdx * H_out * W_out + yIdx * W_out + xIdx;
  T grid_X = grid_data[grid_idx * 2 + 0];
  T grid_Y = grid_data[grid_idx * 2 + 1];
  int outIdx = idx;

  T grid_x_imgSpace = GsDenormalize(grid_X, W_in, align_corners == 1);
  T grid_y_imgSpace = GsDenormalize(grid_Y, H_in, align_corners == 1);
  if (mode == 1) {  // nearest
    grid_x_imgSpace = nearbyint(grid_x_imgSpace);
    grid_y_imgSpace = nearbyint(grid_y_imgSpace);
  }

  float x_min = -0.5f;
  float x_max = W_in - 0.5f;
  float y_min = -0.5f;
  float y_max = H_in - 0.5f;
  if (align_corners) {
    x_min = 0.0f;
    x_max = W_in - 1.0f;
    y_min = 0.0f;
    y_max = H_in - 1.0f;
  }
  float border[] = {x_min, y_min, x_max, y_max};  // l-t-r-b

  if (grid_x_imgSpace < x_min || grid_x_imgSpace > x_max ||
      grid_y_imgSpace < y_min || grid_y_imgSpace > y_max) {  // out of bound
    if (padding_mode == 1) {  // border
      grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f));
      grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f));
    } else if (padding_mode == 2) {  // reflection
      grid_x_imgSpace = GsReflect(grid_x_imgSpace, x_min, x_max);
      grid_y_imgSpace = GsReflect(grid_y_imgSpace, y_min, y_max);
    }
  }

  if (mode == 0) {  // bilinear
    int x1 = floor(grid_x_imgSpace);
    int y1 = floor(grid_y_imgSpace);
    int x2 = x1 + 1;
    int y2 = y1 + 1;
    T w_lt = 0.0f;
    T w_rt = 0.0f;
    T w_lb = 0.0f;
    T w_rb = 0.0f;
    T w_r = grid_x_imgSpace - x1;
    T w_l = 1.0f - w_r;
    T w_b = grid_y_imgSpace - y1;
    T w_t = 1.0f - w_b;
    w_lt = w_t * w_l;
    w_rt = w_t * w_r;
    w_lb = w_b * w_l;
    w_rb = w_b * w_r;
    T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border);
    T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border);
    T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border);
    T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border);
    T interpoV = w_lt * lt_v + w_rt * rt_v + w_lb * lb_v + w_rb * rb_v;
    output_data[outIdx] = interpoV;
    return;
  }
  if (mode == 1) {  // nearest
    int x_n = grid_x_imgSpace;
    int y_n = grid_y_imgSpace;
    output_data[outIdx] = PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border);
    return;
  }
  if (mode == 2) {  // bicubic
    int64_t x0 = static_cast<int64_t>(std::floor(grid_x_imgSpace)) - 1;  // top-left corner of the bbox
    int64_t y0 = static_cast<int64_t>(std::floor(grid_y_imgSpace)) - 1;
    T p[4][4] = {};  // [H][W]
    for (int64_t h = 0; h < 4; h++) {
      for (int64_t w = 0; w < 4; w++) {
        p[h][w] = PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border);
      }
    }
    T dx = grid_x_imgSpace - x0 - 1;
    T dy = grid_y_imgSpace - y0 - 1;
    output_data[outIdx] = GsBicubicInterpolate(p, dx, dy);
  }
}

template <typename T>
void GridSampleImpl(
    hipStream_t stream,
    const T* input_data,
    const T* grid_data,
    const int64_t mode,
    const int64_t padding_mode,
    const int64_t align_corners,
    const int64_t dims[4],
    const int64_t H_out,
    const int64_t W_out,
    T* output_data) {
  int blocksPerGrid = (int)(ceil(static_cast<T>(dims[0] * dims[1] * H_out * W_out) / GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_GridSampleKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     input_data, grid_data, mode, padding_mode, align_corners,
                     dims[0], dims[1], dims[2], dims[3], H_out, W_out, output_data);
}

#define SPECIALIZED_IMPL(T)                                                                     \
  template void GridSampleImpl<T>(hipStream_t stream, const T* input_data, const T* grid_data, \
                                  const int64_t mode, const int64_t padding_mode,               \
                                  const int64_t align_corners, const int64_t[4],                \
                                  const int64_t H_out, const int64_t W_out, T* output_data);

SPECIALIZED_IMPL(float)

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
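The two branches of GsDenormalize map the normalized grid range [-1, 1] into image space differently. Evaluating both formulas at the grid extremes for a width of 4 makes the difference concrete; this is a standalone worked example mirroring the formulas above, not new behavior:

#include <cstdio>

float Denorm(float n, long long length, bool align_corners) {
  return align_corners ? (n + 1.f) / 2.f * (length - 1)
                       : ((n + 1.f) * length - 1.f) / 2.f;
}

int main() {
  // align_corners=true maps [-1, 1] onto pixel centers [0, 3];
  // align_corners=false maps [-1, 1] onto [-0.5, 3.5].
  std::printf("align_corners=true:  %.1f .. %.1f\n", Denorm(-1.f, 4, true), Denorm(1.f, 4, true));
  std::printf("align_corners=false: %.1f .. %.1f\n", Denorm(-1.f, 4, false), Denorm(1.f, 4, false));
  return 0;
}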
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/grid_sample_impl.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

template <typename T>
void GridSampleImpl(
    hipStream_t stream,
    const T* input_data,
    const T* grid_data,
    const int64_t mode,
    const int64_t padding_mode,
    const int64_t align_corners,
    const int64_t dims_input[4],
    const int64_t H_out,
    const int64_t W_out,
    T* output_data);

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/layer_norm.cc
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/layer_norm.h"
#include "core/providers/rocm/rocm_common.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

// LayerNormalization is an official ONNX operator in opset 17.
#define REGISTER_KERNEL_TYPED(T, U, V)                                                                               \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(LayerNormalization, kOnnxDomain, 1, 16, T##_##U##_##V,                     \
                                          kRocmExecutionProvider,                                                    \
                                          (*KernelDefBuilder::Create())                                              \
                                              .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())                 \
                                              .TypeConstraint("U", DataTypeImpl::GetTensorType<U>())                 \
                                              .TypeConstraint("V", DataTypeImpl::GetTensorType<V>()),                \
                                          onnxruntime::rocm::LayerNorm<T, U, V, false>);                             \
  ONNX_OPERATOR_TYPED_KERNEL_EX(SimplifiedLayerNormalization, kOnnxDomain, 1, T##_##U##_##V, kRocmExecutionProvider, \
                                (*KernelDefBuilder::Create())                                                        \
                                    .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())                           \
                                    .TypeConstraint("U", DataTypeImpl::GetTensorType<U>())                           \
                                    .TypeConstraint("V", DataTypeImpl::GetTensorType<V>()),                          \
                                onnxruntime::rocm::LayerNorm<T, U, V, true>);

REGISTER_KERNEL_TYPED(float, float, float)
REGISTER_KERNEL_TYPED(double, double, double)
REGISTER_KERNEL_TYPED(MLFloat16, float, MLFloat16)
REGISTER_KERNEL_TYPED(float, float, MLFloat16)
REGISTER_KERNEL_TYPED(MLFloat16, float, float)
REGISTER_KERNEL_TYPED(BFloat16, float, BFloat16)

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
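The registration macro keys each kernel by the pasted type triple T##_##U##_##V, so REGISTER_KERNEL_TYPED(float, float, float) registers under the name float_float_float. A tiny standalone demonstration of that token-pasting mechanism (the NAME/STR macros are illustrative only, not from the commit):

#include <cstdio>

#define NAME(T, U, V) T##_##U##_##V
#define STR2(x) #x
#define STR(x) STR2(x)  // indirection so NAME expands before stringizing

int main() {
  std::printf("%s\n", STR(NAME(float, float, float)));  // prints: float_float_float
  return 0;
}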
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/math/bias_dropout.cc
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "contrib_ops/rocm/math/bias_dropout.h"
#include "core/providers/common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace contrib {
namespace rocm {

namespace {

template <typename T>
struct GetRatioDataImpl {
  void operator()(const Tensor* ratio, float& ratio_data) const {
    ratio_data = static_cast<float>(*(ratio->Data<T>()));
    ORT_ENFORCE(ratio_data >= 0.0f && ratio_data < 1.0f, "ratio_data is outside range [0, 1)");
  }
};

template <typename T>
struct BiasDropoutComputeImpl {
  Status operator()(const hipDeviceProp_t& prop,
                    hipStream_t stream,
                    const int64_t N,
                    const int64_t mask_element_count,
                    const fast_divmod fdm_dim,
                    const float ratio_data,
                    PhiloxGenerator& generator,
                    const Tensor& X,
                    const Tensor& bias,
                    const Tensor* residual,
                    Tensor& Y,
                    void* mask_data,
                    bool has_same_shape_bias,
                    bool use_bitmask) const {
    typedef typename ToHipType<T>::MappedType HipT;
    const HipT* X_data = reinterpret_cast<const HipT*>(X.Data<T>());
    const HipT* bias_data = reinterpret_cast<const HipT*>(bias.Data<T>());
    const HipT* residual_data = nullptr;
    if (residual) {
      if (residual->Shape() != X.Shape()) {
        return Status(common::ONNXRUNTIME, common::FAIL, "Residual input shape does not match X input shape.");
      }
      residual_data = reinterpret_cast<const HipT*>(residual->Data<T>());
    }

    HipT* Y_data = reinterpret_cast<HipT*>(Y.MutableData<T>());
    BiasDropoutKernelImpl<HipT>(prop, stream, N, mask_element_count, fdm_dim, ratio_data, generator, X_data,
                                bias_data, residual_data, Y_data, mask_data, has_same_shape_bias, use_bitmask);
    return Status::OK();
  }
};

}  // namespace

ONNX_OPERATOR_KERNEL_EX(
    BiasDropout,
    kMSDomain,
    1,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
        .TypeConstraint("T1", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
        .TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
        .InputMemoryType(OrtMemTypeCPUInput, 3)
        .InputMemoryType(OrtMemTypeCPUInput, 4),
    BiasDropout<false>);

ONNX_OPERATOR_KERNEL_EX(
    BitmaskBiasDropout,
    kMSDomain,
    1,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
        .TypeConstraint("T1", BuildKernelDefConstraints<MLFloat16, float, double, BFloat16>())
        .TypeConstraint("T2", DataTypeImpl::GetTensorType<bool>())
        .TypeConstraint("T3", DataTypeImpl::GetTensorType<BitmaskElementType>())
        .InputMemoryType(OrtMemTypeCPUInput, 3)
        .InputMemoryType(OrtMemTypeCPUInput, 4),
    BiasDropout<true>);

template <bool UseBitmask>
Status BiasDropout<UseBitmask>::ComputeInternal(OpKernelContext* context) const {
  // Get X_data
  const Tensor* X = context->Input<Tensor>(0);
  ORT_RETURN_IF_NOT(X, "X Input is not available.");
  const TensorShape& x_shape = X->Shape();
  const int64_t N = x_shape.Size();

  // Get bias_data
  const Tensor* bias = context->Input<Tensor>(1);
  if (!bias) return Status(common::ONNXRUNTIME, common::FAIL, "Bias input of BiasDropout is not available.");
  const TensorShape& bias_shape = bias->Shape();
  const int64_t dim = bias_shape.GetDims().back();
  bool has_same_shape_bias = (bias_shape == x_shape);
  if (!has_same_shape_bias) {
    if (bias_shape.NumDimensions() != 1) {
      return Status(common::ONNXRUNTIME, common::FAIL, "Bias input is not a 1D tensor.");
    }
    if (dim != x_shape.GetDims().back()) {
      return Status(common::ONNXRUNTIME, common::FAIL, "Bias' dimension doesn't match input's last dimension.");
    }
  }

  // Get residual_data
  const Tensor* residual = context->Input<Tensor>(2);

  // Get Y_data
  auto Y = context->Output(0, x_shape);

  // Get mask_data
  Tensor* mask = nullptr;
  int64_t mask_element_count = N;
  if (UseBitmask) {
    mask_element_count = (N + kNumBitsPerBitmaskElement - 1) / kNumBitsPerBitmaskElement;
    mask = context->Output(1, {mask_element_count});
  } else {
    mask = context->Output(1, x_shape);
  }

  // Get the ratio_data
  float ratio_data = default_ratio_;
  auto ratio = context->Input<Tensor>(3);
  if (ratio) {
    utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(ratio->GetElementType());
    t_disp.Invoke<GetRatioDataImpl>(ratio, ratio_data);
  }

  // Check for inference mode.
  const Tensor* training_mode = context->Input<Tensor>(4);
  bool is_training_mode = training_mode && *(training_mode->Data<bool>());
  if (!is_training_mode) {
    ratio_data = 0.0f;
  }

  IAllocatorUniquePtr<void> temp_mask_buffer{};  // buffer to use if mask is not provided
  void* const mask_data = [this, mask_element_count, mask, &temp_mask_buffer]() {
    if (mask) return mask->MutableDataRaw();
    temp_mask_buffer =
        GetScratchBuffer<void>(mask_element_count * (UseBitmask ? sizeof(BitmaskElementType) : sizeof(bool)));
    return temp_mask_buffer.get();
  }();

  const fast_divmod fdm_dim(gsl::narrow_cast<int>(dim));
  PhiloxGenerator& generator = generator_ ? *generator_ : PhiloxGenerator::Default();

  utils::MLTypeCallDispatcher<float, MLFloat16, double, BFloat16> t_disp(X->GetElementType());
  return t_disp.InvokeRet<Status, BiasDropoutComputeImpl>(GetDeviceProp(), Stream(), N, mask_element_count, fdm_dim,
                                                          ratio_data, generator, *X, *bias, residual, *Y, mask_data,
                                                          has_same_shape_bias, UseBitmask);
}

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
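In bitmask mode the mask output shrinks from one bool per element to one bit per element: mask_element_count is the ceiling of N over kNumBitsPerBitmaskElement. A standalone rerun of that sizing rule, assuming 32 bits per bitmask element (the constant's definition is not shown in this commit):

#include <cstdio>

// ceil(N / bits_per_element), as in ComputeInternal above.
long long MaskElementCount(long long N, int bits_per_element) {
  return (N + bits_per_element - 1) / bits_per_element;
}

int main() {
  std::printf("N=1000 -> %lld bitmask elements\n", MaskElementCount(1000, 32));  // 32 (1024 bits)
  return 0;
}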
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/math/bias_dropout.h
new file mode 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/framework/random_generator.h"

using namespace onnxruntime::rocm;

namespace onnxruntime {
namespace contrib {
namespace rocm {

template <typename T>
void BiasDropoutKernelImpl(const hipDeviceProp_t& prop,
                           hipStream_t stream,
                           const int64_t N,
                           const int64_t mask_element_count,
                           const fast_divmod fdm_dim,
                           const float ratio,
                           PhiloxGenerator& generator,
                           const T* X_data,
                           const T* bias_data,
                           const T* residual_data,
                           T* Y_data,
                           void* mask_data,
                           bool has_same_shape_bias,
                           bool use_bitmask);

template <bool UseBitmask>
class BiasDropout final : public RocmKernel {
 public:
  BiasDropout(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t seed = 0;
    if (info.GetAttr<int64_t>("seed", &seed).IsOK()) {
      generator_ = std::make_unique<PhiloxGenerator>(static_cast<uint64_t>(seed));
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  mutable std::unique_ptr<PhiloxGenerator> generator_;
  static constexpr float default_ratio_ = 0.5f;
};

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/contrib_ops/rocm/math/bias_dropout_impl.cu
new file mode 100644

#include "hip/hip_runtime.h"
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Modifications Copyright (c) Microsoft. */

#include "contrib_ops/rocm/math/bias_dropout.h"

#include <hiprand_kernel.h>
#include <algorithm>

#include "core/providers/rocm/cu_inc/bitmask.cuh"

namespace onnxruntime {
namespace contrib {
namespace rocm {

constexpr int kBlockSize = 256;
constexpr int kNumUnroll = 4;

template <typename T, bool HasSameShapeBias, bool HasResidual, bool UseBitmask>
__global__ void BiasDropoutKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
                                  const int steps_per_thread, const fast_divmod fdm_bits_per_element,
                                  const fast_divmod fdm_dim, const float ratio,
                                  const std::pair<uint64_t, uint64_t> seeds, const T* X_data, const T* bias_data,
                                  const T* residual_data, T* Y_data, void* mask_data) {
  HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
  const float p = 1.0f - ratio;
  const float scale = 1.0f / p;

  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seeds.first, idx, seeds.second, &state);
  float4 rand;

  // We ensure every thread generates the same number of random numbers (by rounding
  // up the size) and at the same timestep (by syncing threads).
  // From ROCM hiprand documentation:
  // The Philox_4x32_10 algorithm is closely tied to the thread and block count.
  // Each thread computes 4 random numbers in the same time thus the most efficient
  // use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
  for (int i = 0; i < steps_per_thread; ++i) {
    HIP_LONG id = idx * kNumUnroll + i * step_size;
    rand = hiprand_uniform4(&state);
    BitmaskElementType thread_bitmask = 0;

    // actual computation
#pragma unroll
    for (int i = 0; i < kNumUnroll; ++i) {
      HIP_LONG li = id + i;
      if (li < N) {
        float bias;
        if (HasSameShapeBias) {
          bias = static_cast<float>(bias_data[li]);
        } else {
          int offset = fdm_dim.mod(li);
          bias = static_cast<float>(bias_data[offset]);
        }
        bool mask = (&rand.x)[i] < p;
        float output_data = (static_cast<float>(X_data[li]) + bias) * mask * scale;
        if (HasResidual) {
          output_data += static_cast<float>(residual_data[li]);
        }
        Y_data[li] = static_cast<T>(output_data);
        if (UseBitmask) {
          thread_bitmask |= (mask << i);
        } else {
          reinterpret_cast<bool*>(mask_data)[li] = mask;
        }
      }
    }

    if (UseBitmask) {
      SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
                             reinterpret_cast<BitmaskElementType*>(mask_data));
    }

    __syncthreads();
  }
}

template <typename T, bool HasSameShapeBias, bool HasResidual, bool UseBitmask>
__global__ void BiasDropoutVectorizedKernel(const HIP_LONG N, const HIP_LONG mask_element_count, const int step_size,
                                            const int steps_per_thread, const fast_divmod fdm_bits_per_element,
                                            const fast_divmod fdm_dim, const float ratio,
                                            const std::pair<uint64_t, uint64_t> seeds, const T* X_data,
                                            const T* bias_data, const T* residual_data, T* Y_data, void* mask_data) {
  HIP_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
  const float p = 1.0f - ratio;
  const float scale = 1.0f / p;

  hiprandStatePhilox4_32_10_t state;
  hiprand_init(seeds.first, idx, seeds.second, &state);
  float4 rand;

  // using vectorized data load/store approach when N % 4 == 0
  // since this is typical case for input shape size
  using LoadT = aligned_vector<T, kNumUnroll>;
  using MaskLoadT = aligned_vector<bool, kNumUnroll>;
  using ResidualLoadT = aligned_vector<T, kNumUnroll>;

  for (int i = 0; i < steps_per_thread; ++i) {
    HIP_LONG id = idx * kNumUnroll + i * step_size;
    rand = hiprand_uniform4(&state);
    BitmaskElementType thread_bitmask = 0;

    if (id < N) {
      // vectorized load into storage
      T bias_vec[kNumUnroll];
      if (HasSameShapeBias) {
        LoadT* value0 = reinterpret_cast<LoadT*>(&bias_vec);
        *value0 = *reinterpret_cast<const LoadT*>(&bias_data[id]);
      }

      T src[kNumUnroll];
      LoadT* value1 = reinterpret_cast<LoadT*>(&src);
      *value1 = *reinterpret_cast<const LoadT*>(&X_data[id]);

      T residual[kNumUnroll];
      if (HasResidual) {
        ResidualLoadT* value2 = reinterpret_cast<ResidualLoadT*>(&residual);
        *value2 = *reinterpret_cast<const ResidualLoadT*>(&residual_data[id]);
      }

      T r[kNumUnroll];
      bool masks[kNumUnroll];

      // actual computation
#pragma unroll
      for (int ii = 0; ii < kNumUnroll; ii++) {
        float bias;
        if (HasSameShapeBias) {
          bias = static_cast<float>(bias_vec[ii]);
        } else {
          int offset = fdm_dim.mod(id + ii);
          bias = static_cast<float>(bias_data[offset]);
        }
        bool mask = (&rand.x)[ii] < p;
        float output_data = (static_cast<float>(src[ii]) + bias) * mask * scale;
        if (HasResidual) {
          output_data += static_cast<float>(residual[ii]);
        }
        r[ii] = static_cast<T>(output_data);
        if (UseBitmask) {
          thread_bitmask |= (mask << ii);
        } else {
          masks[ii] = mask;
        }
      }

      // Vectorized writes for mask_data & Y_data
      *(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
      if (!UseBitmask) {
        *(reinterpret_cast<MaskLoadT*>(&reinterpret_cast<bool*>(mask_data)[id])) =
            *reinterpret_cast<MaskLoadT*>(&masks[0]);
      }
    }

    if (UseBitmask) {
      SetBitmask<kNumUnroll>(id, mask_element_count, fdm_bits_per_element, thread_bitmask,
                             reinterpret_cast<BitmaskElementType*>(mask_data));
    }

    __syncthreads();
  }
}

#define LAUNCH_BIAS_DROPOUT_KERNEL(FuncName, HasSameShapeBias, HasResidual, UseBitmask)                           \
  hipLaunchKernelGGL(HIP_KERNEL_NAME(FuncName<T, HasSameShapeBias, HasResidual, UseBitmask>), grid_size,          \
                     kBlockSize, 0, stream, static_cast<HIP_LONG>(N), static_cast<HIP_LONG>(mask_element_count),  \
                     step_size, steps_per_thread, fdm_bits_per_element, fdm_dim, ratio, seeds, X_data, bias_data, \
                     residual_data, Y_data, mask_data)

#define HANDLE_BIAS_DROPOUT_USE_BITMASK(FuncName, HasSameShapeBias, HasResidual) \
  if (use_bitmask) {                                                             \
    LAUNCH_BIAS_DROPOUT_KERNEL(FuncName, HasSameShapeBias, HasResidual, true);   \
  } else {                                                                       \
    LAUNCH_BIAS_DROPOUT_KERNEL(FuncName, HasSameShapeBias, HasResidual, false);  \
  }

#define HANDLE_BIAS_DROPOUT_HAS_RESIDUAL(FuncName, HasSameShapeBias)   \
  if (residual_data) {                                                 \
    HANDLE_BIAS_DROPOUT_USE_BITMASK(FuncName, HasSameShapeBias, true); \
  } else {                                                             \
    HANDLE_BIAS_DROPOUT_USE_BITMASK(FuncName, HasSameShapeBias, false);\
  }

#define HANDLE_BIAS_DROPOUT_HAS_SAME_SHAPE_BIAS(FuncName) \
  if (has_same_shape_bias) {                              \
    HANDLE_BIAS_DROPOUT_HAS_RESIDUAL(FuncName, true);     \
  } else {                                                \
    HANDLE_BIAS_DROPOUT_HAS_RESIDUAL(FuncName, false);    \
  }

template <typename T>
void BiasDropoutKernelImpl(const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N,
                           const int64_t mask_element_count, const fast_divmod fdm_dim, const float ratio,
                           PhiloxGenerator& generator, const T* X_data, const T* bias_data, const T* residual_data,
                           T* Y_data, void* mask_data, bool has_same_shape_bias, bool use_bitmask) {
  const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / kBlockSize;
  const int grid_size =
      std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, kBlockSize * kNumUnroll)));

  // Compute the number of random numbers generated by each thread, and increment philox generator offset by that
  // amount.
  const int step_size = kBlockSize * grid_size * kNumUnroll;
  const int steps_per_thread = static_cast<int>(CeilDiv(N, step_size));
  auto seeds = generator.NextPhiloxSeeds(static_cast<uint64_t>(steps_per_thread * kNumUnroll));

  fast_divmod fdm_bits_per_element(kNumBitsPerBitmaskElement);
  if (N % kNumUnroll != 0) {
    HANDLE_BIAS_DROPOUT_HAS_SAME_SHAPE_BIAS(BiasDropoutKernel);
  } else {
    HANDLE_BIAS_DROPOUT_HAS_SAME_SHAPE_BIAS(BiasDropoutVectorizedKernel);
  }
}

#undef HANDLE_BIAS_DROPOUT_HAS_SAME_SHAPE_BIAS
#undef HANDLE_BIAS_DROPOUT_HAS_RESIDUAL
#undef HANDLE_BIAS_DROPOUT_USE_BITMASK
#undef LAUNCH_BIAS_DROPOUT_KERNEL

#define SPECIALIZED_BIAS_DROPOUT_IMPL(T)                                                                              \
  template void BiasDropoutKernelImpl<T>(                                                                             \
      const hipDeviceProp_t& prop, hipStream_t stream, const int64_t N, const int64_t mask_element_count,             \
      const fast_divmod fdm_dim, const float ratio, PhiloxGenerator& generator, const T* X_data, const T* bias_data,  \
      const T* residual_data, T* Y_data, void* mask_data, bool has_same_shape_bias, bool use_bitmask);

SPECIALIZED_BIAS_DROPOUT_IMPL(float)
SPECIALIZED_BIAS_DROPOUT_IMPL(double)
SPECIALIZED_BIAS_DROPOUT_IMPL(half)
SPECIALIZED_BIAS_DROPOUT_IMPL(BFloat16)

#undef SPECIALIZED_BIAS_DROPOUT_IMPL

}  // namespace rocm
}  // namespace contrib
}  // namespace onnxruntime
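BiasDropoutKernelImpl sizes the grid from device occupancy, then derives how many Philox draws each thread consumes so the generator offset can be advanced by exactly that amount. The same arithmetic standalone, with made-up device properties (kBlockSize and kNumUnroll as in the file above):

#include <algorithm>
#include <cstdio>

long long CeilDiv(long long a, long long b) { return (a + b - 1) / b; }

int main() {
  const int kBlockSize = 256, kNumUnroll = 4;           // as in the file above
  const int max_threads_per_sm = 2048, sm_count = 60;   // example device properties
  const long long N = 1LL << 20;                        // example element count

  int blocks_per_sm = max_threads_per_sm / kBlockSize;  // 8
  int grid_size = static_cast<int>(std::min(static_cast<long long>(sm_count) * blocks_per_sm,
                                            CeilDiv(N, kBlockSize * kNumUnroll)));  // min(480, 1024) = 480
  int step_size = kBlockSize * grid_size * kNumUnroll;                // 491520 elements per sweep
  int steps_per_thread = static_cast<int>(CeilDiv(N, step_size));     // 3 sweeps
  // Each thread draws steps_per_thread * kNumUnroll uniforms from Philox.
  std::printf("grid=%d step_size=%d steps_per_thread=%d rngs_per_thread=%d\n",
              grid_size, step_size, steps_per_thread, steps_per_thread * kNumUnroll);
  return 0;
}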