ModelZoo / Uni-Fold_pytorch · Commits · a1c29028

Commit a1c29028, authored Apr 17, 2023 by zhangqha

    update uni-fold

Pipeline #183 canceled with stages · Changes: 312

Showing 20 changed files with 1981 additions and 0 deletions (+1981, -0)
Uni-Core-main/unicore/losses/cross_entropy.py                  +65   -0
Uni-Core-main/unicore/losses/masked_lm.py                      +67   -0
Uni-Core-main/unicore/losses/unicore_loss.py                   +78   -0
Uni-Core-main/unicore/models/__init__.py                       +120  -0
Uni-Core-main/unicore/models/distributed_unicore_model.py      +67   -0
Uni-Core-main/unicore/models/unicore_model.py                  +58   -0
Uni-Core-main/unicore/modules/__init__.py                      +9    -0
Uni-Core-main/unicore/modules/layer_norm.py                    +82   -0
Uni-Core-main/unicore/modules/multihead_attention.py           +222  -0
Uni-Core-main/unicore/modules/softmax_dropout.py               +144  -0
Uni-Core-main/unicore/modules/transformer_decoder.py           +169  -0
Uni-Core-main/unicore/modules/transformer_decoder_layer.py     +123  -0
Uni-Core-main/unicore/modules/transformer_encoder.py           +164  -0
Uni-Core-main/unicore/modules/transformer_encoder_layer.py     +100  -0
Uni-Core-main/unicore/nan_detector.py                          +109  -0
Uni-Core-main/unicore/optim/__init__.py                        +40   -0
Uni-Core-main/unicore/optim/adadelta.py                        +48   -0
Uni-Core-main/unicore/optim/adagrad.py                         +41   -0
Uni-Core-main/unicore/optim/adam.py                            +204  -0
Uni-Core-main/unicore/optim/dynamic_loss_scaler.py             +71   -0
Uni-Core-main/unicore/losses/cross_entropy.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch
import torch.nn.functional as F
from unicore import metrics
from unicore.losses import UnicoreLoss, register_loss


@register_loss("cross_entropy")
class CrossEntropyLoss(UnicoreLoss):
    def __init__(self, task):
        super().__init__(task)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample["net_input"])
        loss = self.compute_loss(model, net_output, sample, reduce=reduce)
        sample_size = sample["target"].size(0)
        logging_output = {
            "loss": loss.data,
            "bsz": sample["target"].size(0),
            "sample_size": sample_size,
        }
        return loss, sample_size, logging_output

    def compute_loss(self, model, net_output, sample, reduce=True):
        lprobs = F.log_softmax(net_output.float(), dim=-1)
        lprobs = lprobs.view(-1, lprobs.size(-1))
        target = sample['target'].view(-1)
        loss = F.nll_loss(
            lprobs,
            target,
            reduction="sum" if reduce else "none",
        )
        return loss

    @staticmethod
    def reduce_metrics(logging_outputs, split='valid') -> None:
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
        # we divide by log(2) to convert the loss from base e to base 2
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )

    @staticmethod
    def logging_outputs_can_be_summed(is_train) -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True will improve distributed training speed.
        """
        return True
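
For reference, the compute_loss path above is plain token-level cross-entropy computed in log space. A minimal standalone sketch (plain PyTorch, hypothetical shapes, not part of this commit):

import torch
import torch.nn.functional as F

# Hypothetical logits for a batch of 4 samples over 10 classes.
net_output = torch.randn(4, 10)
target = torch.tensor([1, 0, 3, 9])

# Same computation as CrossEntropyLoss.compute_loss above.
lprobs = F.log_softmax(net_output.float(), dim=-1)
loss = F.nll_loss(lprobs.view(-1, lprobs.size(-1)), target.view(-1), reduction="sum")

# log_softmax + nll_loss is numerically the same as F.cross_entropy.
assert torch.allclose(loss, F.cross_entropy(net_output, target, reduction="sum"))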
Uni-Core-main/unicore/losses/masked_lm.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch
import torch.nn.functional as F
from unicore import metrics
from unicore.losses import UnicoreLoss, register_loss


@register_loss("masked_lm")
class MaskedLMLoss(UnicoreLoss):
    def __init__(self, task):
        super().__init__(task)
        self.padding_idx = task.dictionary.pad()

    def forward(self, model, sample, reduce=True):
        masked_tokens = sample["target"].ne(self.padding_idx)
        sample_size = masked_tokens.int().sum()
        masked_tokens = torch.where(
            masked_tokens.any(),
            masked_tokens,
            masked_tokens.new([True]),
        )
        logits = model(**sample["net_input"], masked_tokens=masked_tokens)
        target = sample['target']
        if masked_tokens is not None:
            target = target[masked_tokens]
        loss = F.nll_loss(
            F.log_softmax(logits, dim=-1, dtype=torch.float32),
            target,
            ignore_index=self.padding_idx,
            reduction='sum',
        )
        logging_output = {
            "loss": loss.data,
            "bsz": sample["target"].size(0),
            "sample_size": sample_size,
            "seq_len": sample["target"].size(1) * sample["target"].size(0),
        }
        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs, split='valid') -> None:
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        bsz = sum(log.get("bsz", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
        seq_len = sum(log.get("seq_len", 0) for log in logging_outputs)
        # we divide by log(2) to convert the loss from base e to base 2
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        metrics.log_scalar("seq_len", seq_len / bsz, 1, round=3)

    @staticmethod
    def logging_outputs_can_be_summed(is_train) -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True will improve distributed training speed.
        """
        return True
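
The torch.where guard in MaskedLMLoss.forward keeps the gather over masked positions from ever being empty: when no token is masked, every position is treated as masked while sample_size stays 0 for logging. A small sketch of just that guard, with hypothetical tensors (not from the commit):

import torch

padding_idx = 0
# Hypothetical target in which nothing is masked (every entry equals padding_idx).
target = torch.zeros(2, 5, dtype=torch.long)

masked_tokens = target.ne(padding_idx)       # all False here
sample_size = masked_tokens.int().sum()      # 0 masked positions
# Same fallback as in forward(): if nothing is masked, the broadcast True
# marks every position, so target[masked_tokens] is never empty.
masked_tokens = torch.where(
    masked_tokens.any(), masked_tokens, masked_tokens.new([True])
)
print(sample_size.item(), target[masked_tokens].shape)  # 0 torch.Size([10])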
Uni-Core-main/unicore/losses/unicore_loss.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import inspect
from typing import Any, Dict, List

from unicore import metrics, utils
from torch.nn.modules.loss import _Loss


class UnicoreLoss(_Loss):
    def __init__(self, task):
        super().__init__()
        self.task = task
        if task is not None:
            self.args = task.args
            if hasattr(task, "target_dictionary"):
                tgt_dict = task.target_dictionary
                self.padding_idx = tgt_dict.pad() if tgt_dict is not None else -100

    @classmethod
    def add_args(cls, parser):
        pass

    @classmethod
    def build_loss(cls, args, task):
        """Construct a loss from command-line args."""
        # arguments in the __init__.
        init_args = {}
        for p in inspect.signature(cls).parameters.values():
            if (
                p.kind == p.POSITIONAL_ONLY
                or p.kind == p.VAR_POSITIONAL
                or p.kind == p.VAR_KEYWORD
            ):
                # we haven't implemented inference for these argument types,
                # but PRs welcome :)
                raise NotImplementedError("{} not supported".format(p.kind))

            assert p.kind in {p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY}

            if p.name == "task":
                init_args["task"] = task
            elif p.name == "args":
                init_args["args"] = args
            elif hasattr(args, p.name):
                init_args[p.name] = getattr(args, p.name)
            elif p.default != p.empty:
                pass  # we'll use the default value
            else:
                raise NotImplementedError(
                    "Unable to infer Loss arguments, please implement "
                    "{}.build_loss".format(cls.__name__)
                )
        return cls(**init_args)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        raise NotImplementedError

    @staticmethod
    def logging_outputs_can_be_summed(is_train: bool) -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True will improve distributed training speed.
        """
        return False
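
build_loss infers constructor keyword arguments of a loss subclass from the parsed command-line namespace by inspecting the subclass signature. A standalone sketch of that inference (the ExampleLoss class and label_smoothing argument are illustrative, not part of the commit):

import argparse
import inspect

class ExampleLoss:
    # Hypothetical subclass __init__ with one extra keyword argument.
    def __init__(self, task, label_smoothing=0.0):
        self.task = task
        self.label_smoothing = label_smoothing

args = argparse.Namespace(label_smoothing=0.1)
task = object()  # stand-in for a real unicore task

init_args = {}
for p in inspect.signature(ExampleLoss).parameters.values():
    if p.name == "task":
        init_args["task"] = task
    elif hasattr(args, p.name):
        init_args[p.name] = getattr(args, p.name)  # picked up from the parsed args

loss = ExampleLoss(**init_args)
print(loss.label_smoothing)  # 0.1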
Uni-Core-main/unicore/models/__init__.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""isort:skip_file"""

import argparse
import importlib
import os

from .distributed_unicore_model import DistributedUnicoreModel
from .unicore_model import (
    BaseUnicoreModel,
)

MODEL_REGISTRY = {}
ARCH_MODEL_REGISTRY = {}
ARCH_MODEL_NAME_REGISTRY = {}
ARCH_MODEL_INV_REGISTRY = {}
ARCH_CONFIG_REGISTRY = {}


__all__ = [
    "BaseUnicoreModel",
    "DistributedUnicoreModel",
]


def build_model(args, task):
    return ARCH_MODEL_REGISTRY[args.arch].build_model(args, task)


def register_model(name):
    """
    New model types can be added to unicore with the :func:`register_model`
    function decorator.

    For example::

        @register_model("lstm")
        class LSTM(UnicoreEncoderDecoderModel):
            (...)

    .. note:: All models must implement the :class:`BaseUnicoreModel` interface.
        Typically you will extend :class:`UnicoreEncoderDecoderModel` for
        sequence-to-sequence tasks or :class:`UnicoreLanguageModel` for
        language modeling tasks.

    Args:
        name (str): the name of the model
    """

    def register_model_cls(cls):
        if name in MODEL_REGISTRY:
            raise ValueError("Cannot register duplicate model ({})".format(name))
        if not issubclass(cls, BaseUnicoreModel):
            raise ValueError(
                "Model ({}: {}) must extend BaseUnicoreModel".format(name, cls.__name__)
            )
        MODEL_REGISTRY[name] = cls
        return cls

    return register_model_cls


def register_model_architecture(model_name, arch_name):
    """
    New model architectures can be added to unicore with the
    :func:`register_model_architecture` function decorator. After registration,
    model architectures can be selected with the ``--arch`` command-line
    argument.

    For example::

        @register_model_architecture("lstm", "lstm_luong_wmt_en_de")
        def lstm_luong_wmt_en_de(args):
            args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1000)
            (...)

    The decorated function should take a single argument *args*, which is a
    :class:`argparse.Namespace` of arguments parsed from the command-line. The
    decorated function should modify these arguments in-place to match the
    desired architecture.

    Args:
        model_name (str): the name of the Model (Model must already be
            registered)
        arch_name (str): the name of the model architecture (``--arch``)
    """

    def register_model_arch_fn(fn):
        if model_name not in MODEL_REGISTRY:
            raise ValueError(
                "Cannot register model architecture for unknown model type ({})".format(
                    model_name
                )
            )
        if arch_name in ARCH_MODEL_REGISTRY:
            raise ValueError(
                "Cannot register duplicate model architecture ({})".format(arch_name)
            )
        if not callable(fn):
            raise ValueError(
                "Model architecture must be callable ({})".format(arch_name)
            )
        ARCH_MODEL_REGISTRY[arch_name] = MODEL_REGISTRY[model_name]
        ARCH_MODEL_INV_REGISTRY.setdefault(model_name, []).append(arch_name)
        ARCH_CONFIG_REGISTRY[arch_name] = fn
        return fn

    return register_model_arch_fn


# automatically import any Python files in the models/ directory
models_dir = os.path.dirname(__file__)
for file in os.listdir(models_dir):
    path = os.path.join(models_dir, file)
    if (
        not file.startswith("_")
        and not file.startswith(".")
        and (file.endswith(".py") or os.path.isdir(path))
    ):
        model_name = file[: file.find(".py")] if file.endswith(".py") else file
        module = importlib.import_module("unicore.models." + model_name)

        # extra `model_parser` for sphinx
        if model_name in MODEL_REGISTRY:
            parser = argparse.ArgumentParser(add_help=False)
            group_archs = parser.add_argument_group("Named architectures")
            group_archs.add_argument(
                "--arch", choices=ARCH_MODEL_INV_REGISTRY[model_name]
            )
            group_args = parser.add_argument_group("Additional command-line arguments")
            MODEL_REGISTRY[model_name].add_args(group_args)
            globals()[model_name + "_parser"] = parser
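
Typical use of the two decorators above, written out as a hedged sketch (ToyModel, its embed_dim field, and the "toy"/"toy_base" names are illustrative, not part of this commit; it assumes the unicore package is importable):

import torch.nn as nn
from unicore.models import (
    BaseUnicoreModel,
    register_model,
    register_model_architecture,
)

@register_model("toy")
class ToyModel(BaseUnicoreModel):
    def __init__(self, embed_dim):
        super().__init__()
        self.proj = nn.Linear(embed_dim, embed_dim)

    @classmethod
    def build_model(cls, args, task):
        # constructed through unicore.models.build_model(args, task)
        return cls(args.embed_dim)

    def forward(self, x):
        return self.proj(x)

@register_model_architecture("toy", "toy_base")
def toy_base(args):
    # fill in defaults that were not given on the command line
    args.embed_dim = getattr(args, "embed_dim", 768)

At training time the architecture would then be selected with --arch toy_base, which routes through ARCH_MODEL_REGISTRY and ToyModel.build_model.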
Uni-Core-main/unicore/models/distributed_unicore_model.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

from unicore.distributed import (
    ModuleProxyWrapper,
    LegacyDistributedDataParallel
)


logger = logging.getLogger(__name__)


def DistributedUnicoreModel(args, model, process_group, device):
    """
    Wrap a *model* to support distributed data parallel training.

    This is similar to the built-in DistributedDataParallel, but allows
    additional configuration of the DistributedDataParallel class to
    use, and also provides easier access to the wrapped model by
    forwarding requests for missing attributes to the wrapped model.

    Args:
        args (argparse.Namespace): unicore args
        model (BaseUnicoreModel): model to wrap
        process_group: the c10d process group to be used for distributed data
            parallel all-reduction.
        device: device to move model to
    """
    assert isinstance(model, nn.Module)
    if args.ddp_backend in {"c10d", "pytorch_ddp"}:
        wrapped_model = DistributedDataParallel(
            module=model.to(device),
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=args.broadcast_buffers,
            bucket_cap_mb=args.bucket_cap_mb,
            process_group=process_group,
            find_unused_parameters=args.find_unused_parameters,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend in {'apex'}:
        import apex
        wrapped_model = apex.parallel.DistributedDataParallel(
            module=model.to(device)
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend in {"no_c10d", "legacy_ddp"}:
        wrapped_model = LegacyDistributedDataParallel(
            module=model.to(device),
            buffer_size=2 ** 28,
            process_group=process_group,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    else:
        raise ValueError("Unknown --ddp-backend: " + args.ddp_backend)

    return wrapped_model
Uni-Core-main/unicore/models/unicore_model.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Base classes for various unicore models.
"""

import logging

import torch
import torch.nn as nn


logger = logging.getLogger(__name__)


class BaseUnicoreModel(nn.Module):
    """Base class for unicore models."""

    def __init__(self):
        super().__init__()

    @classmethod
    def add_args(cls, parser):
        """Add model-specific arguments to the parser."""
        pass

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        raise NotImplementedError("Model must implement the build_model method")

    def extract_features(self, *args, **kwargs):
        """Similar to *forward* but only return features."""
        return self(*args, **kwargs)

    def load_state_dict(
        self,
        state_dict,
        strict=True,
        model_args=None,
    ):
        """Copies parameters and buffers from *state_dict* into this module and
        its descendants.

        Overrides the method in :class:`nn.Module`.
        """
        return super().load_state_dict(state_dict, strict)

    def set_num_updates(self, num_updates):
        """State from trainer to pass along to model at every update."""

        def _apply(m):
            if hasattr(m, "set_num_updates") and m != self:
                m.set_num_updates(num_updates)

        self.apply(_apply)
Uni-Core-main/unicore/modules/__init__.py  (new file, 0 → 100644)

"""isort:skip_file"""

from .layer_norm import LayerNorm
from .softmax_dropout import softmax_dropout
from .multihead_attention import SelfMultiheadAttention, CrossMultiheadAttention
from .transformer_encoder_layer import TransformerEncoderLayer
from .transformer_encoder import TransformerEncoder, init_bert_params, relative_position_bucket
from .transformer_decoder_layer import TransformerDecoderLayer
from .transformer_decoder import TransformerDecoder
Uni-Core-main/unicore/modules/layer_norm.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import numbers
from torch.nn.parameter import Parameter
from torch.nn import init
from torch.nn import functional as F

try:
    import unicore_fused_layernorm
    import unicore_fused_layernorm_backward_gamma_beta
    HAS_LAYER_NORM = True
except:
    print("fused_layer_norm is not installed correctly")
    HAS_LAYER_NORM = False

if not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 7:
    HAS_LAYER_NORM = False


class FusedLayerNormFastFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight, bias, normalized_shape, eps):
        ctx.normalized_shape = normalized_shape
        ctx.eps = eps
        input = input.contiguous()
        weight = weight.contiguous()
        bias = bias.contiguous()
        output, mean, invvar = unicore_fused_layernorm.forward(
            input, ctx.normalized_shape, weight, bias, ctx.eps)
        ctx.save_for_backward(input, weight, bias, mean, invvar)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_, weight_, bias_, mean, invvar = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input = unicore_fused_layernorm.backward(
            grad_output.contiguous(), mean, invvar,
            input_, ctx.normalized_shape,
            weight_, bias_, ctx.eps)
        grad_weight, grad_bias = unicore_fused_layernorm_backward_gamma_beta.backward(
            grad_output.contiguous(), mean, invvar,
            input_, ctx.normalized_shape,
            weight_, bias_, ctx.eps)
        return grad_input, grad_weight, grad_bias, None, None


FUSED_LAYER_NORM_SUPPORT_DIM = set([64, 128, 256, 320, 384, 512, 640, 768, 1024, 1280, 1536, 1792, 2048, 2560, 5120])


class LayerNorm(torch.nn.Module):
    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super(LayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        assert elementwise_affine
        self.weight = Parameter(torch.Tensor(*normalized_shape))
        self.bias = Parameter(torch.Tensor(*normalized_shape))
        self.reset_parameters()

        def torch_layer_norm(input):
            return F.layer_norm(
                input, self.normalized_shape, self.weight.type(input.dtype), self.bias.type(input.dtype), self.eps)

        def fused_layer_norm(input):
            if input.is_cuda:
                return FusedLayerNormFastFunction.apply(
                    input, self.weight.type(input.dtype), self.bias.type(input.dtype), self.normalized_shape, self.eps)
            else:
                return F.layer_norm(
                    input, self.normalized_shape, self.weight.type(input.dtype), self.bias.type(input.dtype), self.eps)

        self.func = torch_layer_norm if (not HAS_LAYER_NORM or normalized_shape[0] not in FUSED_LAYER_NORM_SUPPORT_DIM) else fused_layer_norm

    def reset_parameters(self):
        init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, input):
        return self.func(input)

    def extra_repr(self):
        return '{normalized_shape}, eps={eps}, ' \
            'elementwise_affine=True'.format(**self.__dict__)
Uni-Core-main/unicore/modules/multihead_attention.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Dict, Optional

import torch
from torch import Tensor, nn

from .softmax_dropout import softmax_dropout


class SelfMultiheadAttention(nn.Module):
    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.1,
        bias=True,
        scaling_factor=1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout

        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = (self.head_dim * scaling_factor) ** -0.5

        self.in_proj = nn.Linear(embed_dim, embed_dim * 3, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        query,
        key_padding_mask: Optional[Tensor] = None,
        attn_bias: Optional[Tensor] = None,
        return_attn: bool = False,
    ) -> Tensor:

        bsz, tgt_len, embed_dim = query.size()
        assert embed_dim == self.embed_dim

        q, k, v = self.in_proj(query).chunk(3, dim=-1)

        q = (
            q.view(bsz, tgt_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
            .view(bsz * self.num_heads, -1, self.head_dim)
            * self.scaling
        )
        if k is not None:
            k = (
                k.view(bsz, -1, self.num_heads, self.head_dim)
                .transpose(1, 2)
                .contiguous()
                .view(bsz * self.num_heads, -1, self.head_dim)
            )
        if v is not None:
            v = (
                v.view(bsz, -1, self.num_heads, self.head_dim)
                .transpose(1, 2)
                .contiguous()
                .view(bsz * self.num_heads, -1, self.head_dim)
            )

        assert k is not None
        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        attn_weights = torch.bmm(q, k.transpose(1, 2))

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights.masked_fill_(
                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if not return_attn:
            attn = softmax_dropout(
                attn_weights, self.dropout, self.training, bias=attn_bias,
            )
        else:
            attn_weights += attn_bias
            attn = softmax_dropout(
                attn_weights, self.dropout, self.training, inplace=False,
            )

        o = torch.bmm(attn, v)
        assert list(o.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]

        o = (
            o.view(bsz, self.num_heads, tgt_len, self.head_dim)
            .transpose(1, 2)
            .contiguous()
            .view(bsz, tgt_len, embed_dim)
        )
        o = self.out_proj(o)
        if not return_attn:
            return o
        else:
            return o, attn_weights, attn


class CrossMultiheadAttention(nn.Module):
    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.1,
        bias=True,
        scaling_factor=1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout

        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = (self.head_dim * scaling_factor) ** -0.5

        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        query,
        key,
        value,
        key_padding_mask: Optional[Tensor] = None,
        attn_bias: Optional[Tensor] = None,
    ) -> Tensor:

        bsz, tgt_len, embed_dim = query.size()
        assert embed_dim == self.embed_dim

        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)

        q = (
            q.view(bsz, tgt_len, self.num_heads, self.head_dim)
            .transpose(1, 2)
            .contiguous()
            .view(bsz * self.num_heads, -1, self.head_dim)
            * self.scaling
        )
        if k is not None:
            k = (
                k.view(bsz, -1, self.num_heads, self.head_dim)
                .transpose(1, 2)
                .contiguous()
                .view(bsz * self.num_heads, -1, self.head_dim)
            )
        if v is not None:
            v = (
                v.view(bsz, -1, self.num_heads, self.head_dim)
                .transpose(1, 2)
                .contiguous()
                .view(bsz * self.num_heads, -1, self.head_dim)
            )

        assert k is not None
        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        attn_weights = torch.bmm(q, k.transpose(1, 2))

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights.masked_fill_(
                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn = softmax_dropout(attn_weights, self.dropout, self.training, bias=attn_bias)

        o = torch.bmm(attn, v)
        assert list(o.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]

        o = (
            o.view(bsz, self.num_heads, tgt_len, self.head_dim)
            .transpose(1, 2)
            .contiguous()
            .view(bsz, tgt_len, embed_dim)
        )
        o = self.out_proj(o)
        return o
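
A quick shape walkthrough for the self-attention module above, as a CPU sketch with small arbitrary dimensions (it assumes unicore.modules is importable; the numbers are illustrative only):

import torch
from unicore.modules import SelfMultiheadAttention

bsz, tgt_len, embed_dim, heads = 2, 5, 16, 4
attn = SelfMultiheadAttention(embed_dim, heads, dropout=0.0)
attn.eval()

query = torch.randn(bsz, tgt_len, embed_dim)
# Optional additive attention bias, flattened over batch and heads:
# shape (bsz * heads, tgt_len, tgt_len), added to the logits before softmax.
attn_bias = torch.zeros(bsz * heads, tgt_len, tgt_len)

out = attn(query, attn_bias=attn_bias)
print(out.shape)  # torch.Size([2, 5, 16]) -- same shape as the input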
Uni-Core-main/unicore/modules/softmax_dropout.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn.functional as F

try:
    import unicore_fused_softmax_dropout
    HAS_SOFTMAX = True
except:
    print("fused_softmax is not installed correctly")
    HAS_SOFTMAX = False

if not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 7:
    HAS_SOFTMAX = False


class SoftmaxDropoutFast(torch.autograd.Function):
    @staticmethod
    def forward(ctx, is_training, inputs, mask, bias, dropout_prob):
        (
            dropout_results,
            dropout_mask,
            softmax_results,
        ) = unicore_fused_softmax_dropout.forward(
            is_training, inputs, mask, bias, dropout_prob, None
        )
        if is_training:
            ctx.dropout_prob = dropout_prob
            ctx.save_for_backward(softmax_results, dropout_mask)
            ctx.has_bias = bias is not None and bias.requires_grad
            if ctx.has_bias:
                ctx.bias_batch_dim = bias.shape[0]
        return dropout_results

    @staticmethod
    def backward(ctx, grad_output):
        softmax_results, dropout_mask = ctx.saved_tensors
        dropout_prob = ctx.dropout_prob
        grad_output = grad_output.contiguous()
        grad_input = unicore_fused_softmax_dropout.backward(
            grad_output, softmax_results, dropout_mask, dropout_prob
        )
        if ctx.has_bias:
            grad_bias = grad_input.view(
                -1, ctx.bias_batch_dim, grad_input.shape[-2], grad_input.shape[-1]
            ).sum(dim=0)
        else:
            grad_bias = None
        return None, grad_input, None, grad_bias, None


def _check_mask(mask, input):
    try:
        assert mask.dtype == input.dtype, "mask and input must have the same dtype"
        assert len(mask.shape) == len(input.shape), "wrong length of mask.shape"
        assert (
            mask.shape[-3] == 1 or mask.shape[-3] == input.shape[-3]
        ), "mask.shape[-3] must be 1 or input.shape[-3]"
        if mask.shape[-3] == 1:
            assert mask.shape[-2] == 1, "when mask.shape[-3] == 1, mask.shape[-2] must be 1"
        else:
            assert (
                mask.shape[-2] == 1 or mask.shape[-2] == input.shape[-2]
            ), "mask.shape[-2] must be 1 or input.shape[-2]"
        return True
    except:
        return False


def _check_bias(bias, input):
    try:
        assert bias.dtype == input.dtype, "bias and input must have the same dtype"
        assert len(bias.shape) == len(input.shape), "wrong length of bias.shape"
        assert bias.shape[-1] == input.shape[-1], "bias.shape[-1] must be input.shape[-1]"
        assert bias.shape[-2] == input.shape[-2], "bias.shape[-2] must be input.shape[-2]"
        len_shape = len(input.shape)
        if len_shape > 3:
            # head dim should be the same
            assert (
                bias.shape[-3] == input.shape[-3]
            ), "bias.shape[-3] must be input.shape[-3]"
            offset = 3
        else:
            offset = 2
        prev_non_one = True
        for i in range(len_shape - offset - 1, -1, -1):
            if prev_non_one:
                assert (
                    bias.shape[i] == input.shape[i] or bias.shape[i] == 1
                ), "bias.shape[{}] must be input.shape[{}] or 1".format(i, i)
            else:
                assert bias.shape[i] == 1, "bias.shape[{}] must be 1".format(i)
            prev_non_one = bias.shape[i] != 1
        return True
    except:
        return False


def softmax_dropout(input, dropout_prob, is_training=True, mask=None, bias=None, inplace=True):
    """softmax dropout, and mask, bias are optional.

    Args:
        input (torch.Tensor): input tensor
        dropout_prob (float): dropout probability
        is_training (bool, optional): is in training or not. Defaults to True.
        mask (torch.Tensor, optional): the mask tensor, used as input + mask. Defaults to None.
        bias (torch.Tensor, optional): the bias tensor, used as input + bias. Defaults to None.

    Returns:
        torch.Tensor: the result after softmax
    """
    input = input.contiguous()
    if not inplace:
        # copy the input for the non-inplace case
        input = input.clone()
    if input.is_cuda and HAS_SOFTMAX:
        input_size = input.size()
        if mask is not None:
            if _check_mask(mask, input):
                mask = mask.contiguous().view(-1, mask.shape[-2], mask.shape[-1])
            else:
                input += mask
                mask = None
        if bias is not None:
            if _check_bias(bias, input):
                bias = bias.contiguous().view(-1, input_size[-2], input_size[-1])
            else:
                input += bias
                bias = None
        input = input.view(-1, input_size[-2], input_size[-1])
        if dropout_prob <= 0.0 or input_size[-1] <= 1024:
            return SoftmaxDropoutFast.apply(
                is_training, input, mask, bias, dropout_prob
            ).view(*input_size)
        else:
            return F.dropout(SoftmaxDropoutFast.apply(
                is_training, input, mask, bias, 0.0
            ).view(*input_size), p=dropout_prob, training=is_training)
    else:
        if mask is not None:
            input += mask
        if bias is not None:
            input += bias
        return F.dropout(F.softmax(input, dim=-1), p=dropout_prob, training=is_training)
Uni-Core-main/unicore/modules/transformer_decoder.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from . import TransformerDecoderLayer, LayerNorm
from .transformer_encoder import relative_position_bucket


def fill_with_neg_inf(t):
    return t.fill_(float("-inf"))


def bulid_future_mask(seq_len):
    return torch.triu(
        fill_with_neg_inf(torch.zeros([seq_len, seq_len])), 1
    )


class TransformerDecoder(nn.Module):
    def __init__(
        self,
        decoder_layers: int = 6,
        embed_dim: int = 768,
        ffn_embed_dim: int = 3072,
        attention_heads: int = 8,
        emb_dropout: float = 0.1,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        max_seq_len: int = 256,
        activation_fn: str = "gelu",
        rel_pos: bool = True,
        rel_pos_bins: int = 32,
        max_rel_pos: int = 128,
        post_ln: bool = False,
        auto_regressive: bool = True,
    ) -> None:

        super().__init__()
        self.emb_dropout = emb_dropout
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim
        self.attention_heads = attention_heads
        self.emb_layer_norm = LayerNorm(self.embed_dim)
        self.auto_regressive = auto_regressive
        if self.auto_regressive:
            self._future_mask = bulid_future_mask(self.max_seq_len)
        else:
            self._future_mask = None
        if not post_ln:
            self.final_layer_norm = LayerNorm(self.embed_dim)
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList(
            [
                TransformerDecoderLayer(
                    embed_dim=self.embed_dim,
                    ffn_embed_dim=ffn_embed_dim,
                    attention_heads=attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                    post_ln=post_ln,
                )
                for _ in range(decoder_layers)
            ]
        )

        self.rel_pos = rel_pos

        if self.rel_pos:
            assert rel_pos_bins % 2 == 0
            self.rel_pos_bins = rel_pos_bins
            self.max_rel_pos = max_rel_pos
            self.relative_attention_bias = nn.Embedding(self.rel_pos_bins, self.attention_heads)
            seq_len = self.max_seq_len
            context_position = torch.arange(seq_len, dtype=torch.long)[:, None]
            memory_position = torch.arange(seq_len, dtype=torch.long)[None, :]
            relative_position = memory_position - context_position
            self.rp_bucket = relative_position_bucket(
                relative_position,
                num_buckets=self.rel_pos_bins,
                max_distance=self.max_rel_pos
            )
            self.rp_bucket -= self.rp_bucket.min()

    def get_rel_pos_bias(self, x):
        # Assume the input is ordered. If your input token is permuted, you may need to update this accordingly
        if self.rp_bucket.device != x.device:
            self.rp_bucket = self.rp_bucket.to(x.device)
        seq_len = x.size(1)
        rp_bucket = self.rp_bucket[:seq_len, :seq_len]
        values = F.embedding(rp_bucket, self.relative_attention_bias.weight)
        values = values.permute([2, 0, 1])
        return values.contiguous()

    def get_future_mask(self, x, attn_mask):
        if not self.auto_regressive:
            return attn_mask
        if self._future_mask.device != x.device:
            self._future_mask = self._future_mask.to(x.device)
        if self._future_mask.dtype != x.dtype:
            self._future_mask = self._future_mask.type_as(x)
        if attn_mask is None:
            ret = self._future_mask[:x.size(1), :x.size(1)]
            ret = ret.contiguous().unsqueeze(0).repeat(x.size(0) * self.attention_heads, 1, 1)
            return ret
        else:
            assert list(attn_mask.size()) == [x.size(0) * self.attention_heads, x.size(1), x.size(1)]
            return attn_mask + self._future_mask[:x.size(1), :x.size(1)]

    def forward(
        self,
        emb,
        encoder_out: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
        encoder_padding_mask: Optional[torch.Tensor] = None,
        attn_mask: Optional[torch.Tensor] = None,
        encoder_attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:

        seq_len = emb.size(1)
        x = self.emb_layer_norm(emb)
        x = F.dropout(x, p=self.emb_dropout, training=self.training)

        # account for padding while computing the representation
        if padding_mask is not None:
            x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))

        rel_pos_bias = self.get_rel_pos_bias(x).repeat(x.size(0), 1, 1) if self.rel_pos else None

        if attn_mask is None:
            attn_mask = rel_pos_bias
        elif rel_pos_bias is not None:
            attn_mask += rel_pos_bias

        if self.auto_regressive:
            attn_mask = self.get_future_mask(x, attn_mask)

        if attn_mask is not None and padding_mask is not None:
            # merge key_padding_mask and attn_mask
            attn_mask = attn_mask.view(x.size(0), -1, seq_len, seq_len)
            attn_mask.masked_fill_(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf")
            )
            attn_mask = attn_mask.view(-1, seq_len, seq_len)
            padding_mask = None

        for layer in self.layers:
            x = layer(
                x,
                encoder_out=encoder_out,
                padding_mask=padding_mask,
                attn_bias=attn_mask,
                encoder_padding_mask=encoder_padding_mask,
                encoder_attn_bias=encoder_attn_mask,
            )

        if self.final_layer_norm is not None:
            x = self.final_layer_norm(x)

        return x
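
The causal mask produced by bulid_future_mask above is a strictly upper-triangular matrix of -inf that gets added to the attention logits, so each position can only attend to itself and earlier positions. For example, with seq_len = 4:

import torch

def fill_with_neg_inf(t):
    return t.fill_(float("-inf"))

# Same construction as bulid_future_mask(4) above.
mask = torch.triu(fill_with_neg_inf(torch.zeros([4, 4])), 1)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0.,   0., -inf, -inf],
#         [0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.]])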
Uni-Core-main/unicore/modules/transformer_decoder_layer.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Dict, Optional

import torch
import torch.nn.functional as F
from unicore import utils
from torch import nn
from . import LayerNorm, SelfMultiheadAttention, CrossMultiheadAttention


class TransformerDecoderLayer(nn.Module):
    """
    Implements a Transformer Decoder Layer used in BERT/XLM style pre-trained
    models.
    """

    def __init__(
        self,
        embed_dim: int = 768,
        ffn_embed_dim: int = 3072,
        attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        activation_fn: str = "gelu",
        post_ln=False,
    ) -> None:
        super().__init__()

        # Initialize parameters
        self.embed_dim = embed_dim
        self.attention_heads = attention_heads
        self.attention_dropout = attention_dropout

        self.dropout = dropout
        self.activation_dropout = activation_dropout
        self.activation_fn = utils.get_activation_fn(activation_fn)

        self.self_attn = SelfMultiheadAttention(
            self.embed_dim,
            attention_heads,
            dropout=attention_dropout,
        )
        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        self.encoder_attn = CrossMultiheadAttention(
            self.embed_dim,
            attention_heads,
            dropout=attention_dropout,
        )
        # layer norm associated with the encoder attention layer
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, ffn_embed_dim)
        self.fc2 = nn.Linear(ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.post_ln = post_ln

    def forward(
        self,
        x: torch.Tensor,
        encoder_out: torch.Tensor = None,
        attn_bias: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
        encoder_attn_bias: Optional[torch.Tensor] = None,
        encoder_padding_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        LayerNorm is applied either before or after the self-attention/ffn
        modules similar to the original Transformer implementation.
        """
        residual = x
        if not self.post_ln:
            x = self.self_attn_layer_norm(x)
        # new added
        x = self.self_attn(
            query=x,
            key_padding_mask=padding_mask,
            attn_bias=attn_bias,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if self.post_ln:
            x = self.self_attn_layer_norm(x)

        if encoder_out is not None:
            residual = x
            if not self.post_ln:
                x = self.encoder_attn_layer_norm(x)
            x = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                attn_bias=encoder_attn_bias,
            )
            # x = self.dropout_module(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = residual + x
            if self.post_ln:
                x = self.encoder_attn_layer_norm(x)

        residual = x
        if not self.post_ln:
            x = self.final_layer_norm(x)
        x = self.fc1(x)
        x = self.activation_fn(x)
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if self.post_ln:
            x = self.final_layer_norm(x)

        return x
Uni-Core-main/unicore/modules/transformer_encoder.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

from . import TransformerEncoderLayer, LayerNorm


def init_bert_params(module):
    if not getattr(module, 'can_global_init', True):
        return

    def normal_(data):
        data.copy_(
            data.cpu().normal_(mean=0.0, std=0.02).to(data.device)
        )

    if isinstance(module, nn.Linear):
        normal_(module.weight.data)
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, nn.Embedding):
        normal_(module.weight.data)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()


def relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
    sign = torch.sign(relative_position)
    num_buckets //= 2
    n = torch.abs(relative_position)

    # half of the buckets are for exact increments in positions
    max_exact = num_buckets // 2
    is_small = n < max_exact
    max_bucket_val = num_buckets - 1 - max_exact
    # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
    val_if_large = max_exact + torch.ceil(
        torch.log(n.float() / max_exact)
        / math.log((max_distance - 1) / max_exact)
        * (max_bucket_val)
    ).long()
    val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
    ret = torch.where(is_small, n, val_if_large) * sign
    return ret


class TransformerEncoder(nn.Module):
    def __init__(
        self,
        encoder_layers: int = 6,
        embed_dim: int = 768,
        ffn_embed_dim: int = 3072,
        attention_heads: int = 8,
        emb_dropout: float = 0.1,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        max_seq_len: int = 256,
        activation_fn: str = "gelu",
        rel_pos: bool = True,
        rel_pos_bins: int = 32,
        max_rel_pos: int = 128,
        post_ln: bool = False,
    ) -> None:

        super().__init__()
        self.emb_dropout = emb_dropout
        self.max_seq_len = max_seq_len
        self.embed_dim = embed_dim
        self.attention_heads = attention_heads
        self.emb_layer_norm = LayerNorm(self.embed_dim)
        if not post_ln:
            self.final_layer_norm = LayerNorm(self.embed_dim)
        else:
            self.final_layer_norm = None

        self.layers = nn.ModuleList(
            [
                TransformerEncoderLayer(
                    embed_dim=self.embed_dim,
                    ffn_embed_dim=ffn_embed_dim,
                    attention_heads=attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                    post_ln=post_ln,
                )
                for _ in range(encoder_layers)
            ]
        )

        self.rel_pos = rel_pos

        if self.rel_pos:
            assert rel_pos_bins % 2 == 0
            self.rel_pos_bins = rel_pos_bins
            self.max_rel_pos = max_rel_pos
            self.relative_attention_bias = nn.Embedding(self.rel_pos_bins, self.attention_heads)
            seq_len = self.max_seq_len
            context_position = torch.arange(seq_len, dtype=torch.long)[:, None]
            memory_position = torch.arange(seq_len, dtype=torch.long)[None, :]
            relative_position = memory_position - context_position
            self.rp_bucket = relative_position_bucket(
                relative_position,
                num_buckets=self.rel_pos_bins,
                max_distance=self.max_rel_pos
            )
            self.rp_bucket -= self.rp_bucket.min()

    def get_rel_pos_bias(self, x):
        # Assume the input is ordered. If your input token is permuted, you may need to update this accordingly
        if self.rp_bucket.device != x.device:
            self.rp_bucket = self.rp_bucket.to(x.device)
        seq_len = x.size(1)
        rp_bucket = self.rp_bucket[:seq_len, :seq_len]
        values = F.embedding(rp_bucket, self.relative_attention_bias.weight)
        values = values.permute([2, 0, 1])
        return values.contiguous()

    def forward(
        self,
        emb: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:

        seq_len = emb.size(1)
        x = self.emb_layer_norm(emb)
        x = F.dropout(x, p=self.emb_dropout, training=self.training)

        # account for padding while computing the representation
        if padding_mask is not None:
            x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))

        rel_pos_bias = self.get_rel_pos_bias(x).repeat(x.size(0), 1, 1) if self.rel_pos else None

        if attn_mask is None:
            attn_mask = rel_pos_bias
        elif rel_pos_bias is not None:
            attn_mask += rel_pos_bias

        if attn_mask is not None and padding_mask is not None:
            # merge key_padding_mask and attn_mask
            attn_mask = attn_mask.view(x.size(0), -1, seq_len, seq_len)
            attn_mask.masked_fill_(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf")
            )
            attn_mask = attn_mask.view(-1, seq_len, seq_len)
            padding_mask = None

        for layer in self.layers:
            x = layer(x, padding_mask=padding_mask, attn_bias=attn_mask)

        if self.final_layer_norm is not None:
            x = self.final_layer_norm(x)

        return x
\ No newline at end of file
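
The signed relative-position bucketing above maps each (query, key) offset into one of rel_pos_bins buckets, exact for small offsets and logarithmic up to max_distance; the constructor then shifts the bucket ids to be non-negative before feeding them to nn.Embedding. A small sketch of just that bucketing (sequence length 8 is arbitrary):

import torch
from unicore.modules import relative_position_bucket

seq_len = 8
context_position = torch.arange(seq_len, dtype=torch.long)[:, None]
memory_position = torch.arange(seq_len, dtype=torch.long)[None, :]
relative_position = memory_position - context_position   # offsets in [-7, 7]

rp_bucket = relative_position_bucket(relative_position, num_buckets=32, max_distance=128)
rp_bucket -= rp_bucket.min()   # shift to non-negative ids, as in __init__ above
print(rp_bucket.shape, rp_bucket.min().item(), rp_bucket.max().item())
# torch.Size([8, 8]) 0 14  -- small offsets keep their exact (signed) value as the bucket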
Uni-Core-main/unicore/modules/transformer_encoder_layer.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Dict, Optional

import torch
import torch.nn.functional as F
from unicore import utils
from torch import nn
from . import LayerNorm, SelfMultiheadAttention


class TransformerEncoderLayer(nn.Module):
    """
    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
    models.
    """

    def __init__(
        self,
        embed_dim: int = 768,
        ffn_embed_dim: int = 3072,
        attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        activation_fn: str = "gelu",
        post_ln=False,
    ) -> None:
        super().__init__()

        # Initialize parameters
        self.embed_dim = embed_dim
        self.attention_heads = attention_heads
        self.attention_dropout = attention_dropout

        self.dropout = dropout
        self.activation_dropout = activation_dropout
        self.activation_fn = utils.get_activation_fn(activation_fn)

        self.self_attn = SelfMultiheadAttention(
            self.embed_dim,
            attention_heads,
            dropout=attention_dropout,
        )
        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, ffn_embed_dim)
        self.fc2 = nn.Linear(ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.post_ln = post_ln

    def forward(
        self,
        x: torch.Tensor,
        attn_bias: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
        return_attn: bool = False,
    ) -> torch.Tensor:
        """
        LayerNorm is applied either before or after the self-attention/ffn
        modules similar to the original Transformer implementation.
        """
        residual = x
        if not self.post_ln:
            x = self.self_attn_layer_norm(x)
        # new added
        x = self.self_attn(
            query=x,
            key_padding_mask=padding_mask,
            attn_bias=attn_bias,
            return_attn=return_attn,
        )
        if return_attn:
            x, attn_weights, attn_probs = x
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if self.post_ln:
            x = self.self_attn_layer_norm(x)

        residual = x
        if not self.post_ln:
            x = self.final_layer_norm(x)
        x = self.fc1(x)
        x = self.activation_fn(x)
        x = F.dropout(x, p=self.activation_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        if self.post_ln:
            x = self.final_layer_norm(x)
        if not return_attn:
            return x
        else:
            return x, attn_weights, attn_probs
\ No newline at end of file
Uni-Core-main/unicore/nan_detector.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging

import torch


logger = logging.getLogger(__name__)


class NanDetector:
    """
    Detects the first NaN or Inf in forward and/or backward pass and logs, together with the module name
    """

    def __init__(self, model, forward=True, backward=True):
        self.bhooks = []
        self.fhooks = []
        self.forward = forward
        self.backward = backward
        self.named_parameters = list(model.named_parameters())
        self.reset()

        for name, mod in model.named_modules():
            mod.__module_name = name
            self.add_hooks(mod)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Dump out all model gnorms to enable better debugging
        norm = {}
        gradients = {}
        for name, param in self.named_parameters:
            if param.grad is not None:
                grad_norm = torch.norm(param.grad.data, p=2, dtype=torch.float32)
                norm[name] = grad_norm.item()
                if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any():
                    gradients[name] = param.grad.data
        if len(gradients) > 0:
            logger.info("Detected nan/inf grad norm, dumping norms...")
            logger.info(f"norms: {norm}")
            logger.info(f"gradients: {gradients}")

        self.close()

    def add_hooks(self, module):
        if self.forward:
            self.fhooks.append(module.register_forward_hook(self.fhook_fn))
        if self.backward:
            self.bhooks.append(module.register_backward_hook(self.bhook_fn))

    def reset(self):
        self.has_printed_f = False
        self.has_printed_b = False

    def _detect(self, tensor, name, backward):
        err = None
        if (
            torch.is_floating_point(tensor)
            # single value tensors (like the loss) will not provide much info
            and tensor.numel() >= 2
        ):
            with torch.no_grad():
                if torch.isnan(tensor).any():
                    err = "NaN"
                elif torch.isinf(tensor).any():
                    err = "Inf"
        if err is not None:
            err = f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}"
        return err

    def _apply(self, module, inp, x, backward):
        if torch.is_tensor(x):
            if isinstance(inp, tuple) and len(inp) > 0:
                inp = inp[0]
            err = self._detect(x, module.__module_name, backward)
            if err is not None:
                if torch.is_tensor(inp) and not backward:
                    err += (
                        f" input max: {inp.max().item()}, input min: {inp.min().item()}"
                    )

                has_printed_attr = "has_printed_b" if backward else "has_printed_f"
                logger.warning(err)
                setattr(self, has_printed_attr, True)
        elif isinstance(x, dict):
            for v in x.values():
                self._apply(module, inp, v, backward)
        elif isinstance(x, list) or isinstance(x, tuple):
            for v in x:
                self._apply(module, inp, v, backward)

    def fhook_fn(self, module, inp, output):
        if not self.has_printed_f:
            self._apply(module, inp, output, backward=False)

    def bhook_fn(self, module, inp, output):
        if not self.has_printed_b:
            self._apply(module, inp, output, backward=True)

    def close(self):
        for hook in self.fhooks + self.bhooks:
            hook.remove()
Uni-Core-main/unicore/optim/__init__.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""isort:skip_file"""

import importlib
import os

from unicore import registry
from unicore.optim.unicore_optimizer import (  # noqa
    UnicoreOptimizer,
)
from unicore.optim.fp16_optimizer import FP16Optimizer

__all__ = [
    "UnicoreOptimizer",
    "FP16Optimizer",
]

(
    _build_optimizer,
    register_optimizer,
    OPTIMIZER_REGISTRY,
) = registry.setup_registry("--optimizer", base_class=UnicoreOptimizer, default='adam')


def build_optimizer(args, params, *extra_args, **extra_kwargs):
    if all(isinstance(p, dict) for p in params):
        params = [t for p in params for t in p.values()]
    params = list(filter(lambda p: p.requires_grad, params))
    return _build_optimizer(args, params, *extra_args, **extra_kwargs)


# automatically import any Python files in the optim/ directory
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith(".py") and not file.startswith("_"):
        file_name = file[: file.find(".py")]
        importlib.import_module("unicore.optim." + file_name)
Uni-Core-main/unicore/optim/adadelta.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch.optim

from . import UnicoreOptimizer, register_optimizer


@register_optimizer("adadelta")
class Adadelta(UnicoreOptimizer):
    def __init__(self, args, params):
        super().__init__(args)
        self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--adadelta-rho', type=float, default=0.9, metavar='RHO',
                            help='coefficient used for computing a running average of squared gradients')
        parser.add_argument('--adadelta-eps', type=float, default=1e-6, metavar='EPS',
                            help='term added to the denominator to improve numerical stability')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        parser.add_argument('--anneal-eps', action='store_true', help='flag to anneal eps')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            "lr": self.args.lr[0],
            "rho": self.args.adadelta_rho,
            "eps": self.args.adadelta_eps,
            "weight_decay": self.args.weight_decay,
        }

    @property
    def supports_flat_params(self):
        return True
Uni-Core-main/unicore/optim/adagrad.py  (new file, 0 → 100644)

# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch.optim

from . import UnicoreOptimizer, register_optimizer


@register_optimizer("adagrad")
class Adagrad(UnicoreOptimizer):
    def __init__(self, args, params):
        super().__init__(args)
        self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            "lr": self.args.lr[0],
            "weight_decay": self.args.weight_decay,
        }

    @property
    def supports_flat_params(self):
        return False
Uni-Core-main/unicore/optim/adam.py
0 → 100644
View file @
a1c29028
# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import math
from collections.abc import Collection
from typing import List

import torch
import torch.optim

from unicore.optim import UnicoreOptimizer, register_optimizer
from unicore.optim.fused_adam import get_fused_adam_class

logger = logging.getLogger(__name__)


@register_optimizer("adam")
class UnicoreAdam(UnicoreOptimizer):
    """Adam optimizer for unicore.

    Important note: this optimizer corresponds to the "AdamW" variant of
    Adam in its weight decay behavior. As such, it is most closely
    analogous to torch.optim.AdamW from PyTorch.
    """

    def __init__(self, args, params):
        super().__init__(args)
        fused_adam_cls = get_fused_adam_class()
        use_fused_adam = (
            not getattr(args, "use_old_adam", False)
            and fused_adam_cls is not None
            and torch.cuda.is_available()
            and torch.cuda.get_device_capability()[0] >= 7
        )
        if use_fused_adam:
            logger.info("using FusedAdam")
            self._optimizer = fused_adam_cls(params, **self.optimizer_config)
        else:
            self._optimizer = Adam(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--adam-betas', default='(0.9, 0.999)', metavar='B',
                            help='betas for Adam optimizer')
        parser.add_argument('--adam-eps', type=float, default=1e-8, metavar='D',
                            help='epsilon for Adam optimizer')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            "lr": self.args.lr[0]
            if isinstance(self.args.lr, Collection)
            else self.args.lr,
            "betas": eval(self.args.adam_betas),
            "eps": self.args.adam_eps,
            "weight_decay": self.args.weight_decay,
        }


class Adam(torch.optim.Optimizer):
    r"""Implements Adam algorithm.

    This implementation is modified from torch.optim.Adam based on:
    `Fixed Weight Decay Regularization in Adam`
    (see https://arxiv.org/abs/1711.05101)

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
    ):
        defaults = dict(
            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad
        )
        super(Adam, self).__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError(
                        "Adam does not support sparse gradients, please consider SparseAdam instead"
                    )
                amsgrad = group.get("amsgrad", False)

                p_data_fp32 = p.data
                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p_data_fp32 = p_data_fp32.float()

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32)
                else:
                    state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
                    state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32)
                    if amsgrad:
                        state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to(
                            p_data_fp32
                        )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                if amsgrad:
                    max_exp_avg_sq = state["max_exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                if amsgrad:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group["eps"])
                else:
                    denom = exp_avg_sq.sqrt().add_(group["eps"])

                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]
                step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1

                if group["weight_decay"] != 0:
                    p_data_fp32.add_(
                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
                    )

                p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)

                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p.data.copy_(p_data_fp32)

        return loss
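To make the update in step() above concrete, here is a single hand-computed step on one scalar parameter following the same formulas: bias-corrected step size lr * sqrt(1 - beta2^t) / (1 - beta1^t), and weight decay applied directly to the parameter (the AdamW-style decoupling noted in the UnicoreAdam docstring). The numbers are illustrative only.

# Illustrative single-step trace of the update rule implemented in Adam.step().
import math

lr, beta1, beta2, eps, wd = 1e-3, 0.9, 0.999, 1e-8, 0.01
p, grad, exp_avg, exp_avg_sq, step = 1.0, 0.5, 0.0, 0.0, 0

step += 1
exp_avg = beta1 * exp_avg + (1 - beta1) * grad             # 0.05
exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad ** 2  # 0.00025
denom = math.sqrt(exp_avg_sq) + eps
step_size = lr * math.sqrt(1 - beta2 ** step) / (1 - beta1 ** step)

p = p - wd * lr * p                    # decoupled (AdamW-style) weight decay
p = p - step_size * exp_avg / denom    # ~lr on the first step, whatever the gradient scale
print(p)                               # ~0.98899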
Uni-Core-main/unicore/optim/dynamic_loss_scaler.py
0 → 100644
View file @
a1c29028
# Copyright (c) DP Technology.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
class DynamicLossScaler(object):
    def __init__(
        self,
        init_scale=2.0 ** 15,
        scale_factor=2.0,
        scale_window=2000,
        tolerance=0.0,
        threshold=None,
        min_loss_scale=1e-4,
    ):
        self.loss_scale = init_scale
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.tolerance = tolerance
        self.threshold = threshold
        self._iter = 0
        self._last_overflow_iter = -1
        self._last_rescale_iter = -1
        self._overflows_since_rescale = 0
        self.min_loss_scale = min_loss_scale

    def scale(self, outputs):
        return self.loss_scale * outputs

    def update(self):
        if (self._iter - self._last_overflow_iter) % self.scale_window == 0:
            self.loss_scale *= self.scale_factor
            self._last_rescale_iter = self._iter
        self._iter += 1

    def _decrease_loss_scale(self):
        self.loss_scale /= self.scale_factor
        if self.threshold is not None:
            self.loss_scale = max(self.loss_scale, self.threshold)

    def check_overflow(self, grad_norm):
        # detect inf and nan
        if grad_norm == float("inf") or grad_norm != grad_norm:
            # overflow has occurred
            prev_scale = self.loss_scale
            iter_since_rescale = self._iter - self._last_rescale_iter

            self._last_overflow_iter = self._iter
            self._overflows_since_rescale += 1
            pct_overflow = self._overflows_since_rescale / float(iter_since_rescale)
            if pct_overflow >= self.tolerance:
                self._decrease_loss_scale()
                self._last_rescale_iter = self._iter
                self._overflows_since_rescale = 0

            if self.loss_scale <= self.min_loss_scale:
                # Use FloatingPointError as an uncommon error that parent
                # functions can safely catch to stop training.
                self.loss_scale = prev_scale
                raise FloatingPointError(
                    (
                        "Minimum loss scale reached ({}). Your loss is probably exploding. "
                        "Try lowering the learning rate, using gradient clipping or "
                        "increasing the batch size."
                    ).format(self.min_loss_scale)
                )

            self._iter += 1
            raise OverflowError("setting loss scale to: " + str(self.loss_scale))
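As a sketch of how a scaler like this is typically driven (the tiny model, synthetic data, and manual gradient unscaling below are assumptions for illustration, not Uni-Core's trainer code): the loss is scaled before backward, check_overflow inspects the gradient norm and raises OverflowError on inf/nan so the step can be skipped, and update grows the scale again after scale_window clean iterations.

# Illustrative fp16-style loop around DynamicLossScaler; model, data, and
# unscaling details are assumptions for the sketch.
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = DynamicLossScaler(init_scale=2.0 ** 15)

for _ in range(10):
    optimizer.zero_grad()
    x = torch.randn(8, 4)
    loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()              # backward on the scaled loss
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    grad_norm = torch.norm(torch.stack([g.norm() for g in grads])).item()
    try:
        scaler.check_overflow(grad_norm)       # raises OverflowError on inf/nan gradients
    except OverflowError:
        continue                               # skip this step; the scale was already lowered
    for g in grads:
        g.div_(scaler.loss_scale)              # unscale before the optimizer step
    optimizer.step()
    scaler.update()                            # grows the scale every `scale_window` clean steps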