dcuai / dlexamples · Commits

Commit c0f05c10
authored Nov 29, 2022 by hepj

    Update transformer code

parent c056df78
Changes: 595 in total. Showing 20 changed files, with 0 additions and 3177 deletions:

  PyTorch/NLP/Transformer/fairseq/ddp_trainer.py                                              +0  -305
  PyTorch/NLP/Transformer/fairseq/distributed_utils.py                                        +0  -111
  PyTorch/NLP/Transformer/fairseq/log_helper.py                                               +0  -204
  PyTorch/NLP/Transformer/fairseq/meters.py                                                   +0  -87
  PyTorch/NLP/Transformer/fairseq/models/__init__.py                                          +0  -55
  PyTorch/NLP/Transformer/fairseq/models/fairseq_incremental_decoder.py                       +0  -42
  PyTorch/NLP/Transformer/fairseq/models/fused_layer_norm.py                                  +0  -159
  PyTorch/NLP/Transformer/fairseq/models/transformer.py                                       +0  -621
  PyTorch/NLP/Transformer/fairseq/modules/__init__.py                                         +0  -18
  PyTorch/NLP/Transformer/fairseq/modules/learned_positional_embedding.py                     +0  -31
  PyTorch/NLP/Transformer/fairseq/modules/multihead_attention.py                              +0  -460
  PyTorch/NLP/Transformer/fairseq/modules/strided_batched_gemm/strided_batched_gemm.cpp       +0  -61
  PyTorch/NLP/Transformer/fairseq/modules/strided_batched_gemm/strided_batched_gemm_cuda.cu   +0  -345
  PyTorch/NLP/Transformer/fairseq/optim/__init__.py                                           +0  -46
  PyTorch/NLP/Transformer/fairseq/optim/adam.py                                               +0  -54
  PyTorch/NLP/Transformer/fairseq/optim/fairseq_optimizer.py                                  +0  -94
  PyTorch/NLP/Transformer/fairseq/optim/lr_scheduler/__init__.py                              +0  -39
  PyTorch/NLP/Transformer/fairseq/optim/lr_scheduler/fixed_schedule.py                        +0  -57
  PyTorch/NLP/Transformer/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py                  +0  -46
  PyTorch/NLP/Transformer/fairseq/options.py                                                  +0  -342
PyTorch/NLP/Transformer/fairseq/ddp_trainer.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
#-------------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Train a network across multiple GPUs.
"""

import math
from collections import defaultdict
from itertools import chain

import torch
import torch.nn.functional as F
from torch.cuda import amp
from apex.parallel import DistributedDataParallel as DDP

from fairseq import distributed_utils, optim, utils
from fairseq.optim import lr_scheduler
from fairseq.meters import TimeMeter, AverageMeter
from fairseq.criterions import CRITERION_REGISTRY

import dllogger as DLLogger


class DDPTrainer():
    """Main class for data parallel training.

    This class supports data parallel training, where multiple workers each
    have a full model replica and gradients are accumulated synchronously via
    torch.distributed.all_reduce.
    """

    def __init__(self, args, model):
        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')

        self.args = args

        self.model = model.cuda()
        self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
        self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

        self.scaler = amp.GradScaler(enabled=self.args.amp, init_scale=2**15)

        if self.args.distributed_world_size > 1:
            self.model = DDP(model)

        self._buffered_stats = defaultdict(lambda: [])
        self._num_updates = 0
        self._optim_history = None

        self.throughput_meter = TimeMeter()
        self.avg_loss_meter = AverageMeter()

    def save_checkpoint(self, filename, extra_state):
        """Save all training state in a checkpoint file."""
        if distributed_utils.is_master(self.args):  # only save one checkpoint
            utils.save_state(
                filename, self.args, self.get_model(), self.criterion, self.optimizer,
                self.lr_scheduler, self._num_updates, self._optim_history, extra_state,
            )

    def load_checkpoint(self, filename, load_optim=True):
        """Load all training state from a checkpoint file."""
        extra_state, optim_history, last_optim_state = \
            utils.load_model_state(filename, self.get_model())

        if last_optim_state is not None:
            # rebuild optimizer after loading model, since params may have changed
            #self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
            self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

            if load_optim:
                self._optim_history = optim_history
                # only reload optimizer and lr_scheduler if they match
                last_optim = self._optim_history[-1]
                if last_optim['criterion_name'] == self.criterion.__class__.__name__:
                    self.lr_scheduler.load_state_dict(last_optim['lr_scheduler_state'])
                    if last_optim['optimizer_name'] == self.optimizer.__class__.__name__:
                        self.optimizer.load_state_dict(last_optim_state)

                self._num_updates = last_optim['num_updates']

        return extra_state

    def train_step(self, sample, update_params=True, last_step=False):
        """Do forward, backward and parameter update."""
        # Set seed based on args.seed and the update number so that we get
        # reproducible results when resuming from checkpoints
        seed = self.args.seed + self.get_num_updates()
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

        self.model.train()
        if isinstance(self.model, DDP):
            if last_step:
                self.model.disable_allreduce()
            else:
                self.model.enable_allreduce()

        # forward and backward pass
        sample = self._prepare_sample(sample)
        loss, oom_fwd = self._forward(sample)

        # If this is a last batch forward pass is skipped on some workers
        # Batch with sample_size 0 is not accounted for in weighted loss
        logging_output = {
            'ntokens': sample['ntokens'] if sample is not None else 0,
            'nsentences': sample['target'].size(0) if sample is not None else 0,
            'loss': utils.item(loss.data) if loss is not None else 0,
        }
        sample_size = sample['ntokens'] if sample is not None else 0
        oom_bwd = self._backward(loss)

        # buffer stats and logging outputs
        self._buffered_stats['sample_sizes'].append(sample_size)
        self._buffered_stats['logging_outputs'].append(logging_output)
        self._buffered_stats['ooms_fwd'].append(oom_fwd)
        self._buffered_stats['ooms_bwd'].append(oom_bwd)

        # update parameters
        if update_params and not last_step:
            # gather logging outputs from all replicas
            sample_sizes = self._buffered_stats['sample_sizes']
            logging_outputs = self._buffered_stats['logging_outputs']
            ooms_fwd = self._buffered_stats['ooms_fwd']
            ooms_bwd = self._buffered_stats['ooms_bwd']
            if self.args.distributed_world_size > 1:
                sample_sizes, logging_outputs, ooms_fwd, ooms_bwd = map(
                    lambda l: list(chain.from_iterable(l)),
                    zip(*distributed_utils.all_gather_list(
                        (sample_sizes, logging_outputs, ooms_fwd, ooms_bwd)
                    ))
                )
            ooms_fwd = sum(ooms_fwd)
            ooms_bwd = sum(ooms_bwd)
            ooms = ooms_fwd + ooms_bwd  # this is always <= distributed_world_size

            if ooms == self.args.distributed_world_size:
                print('| WARNING: OOM in all workers, skipping batch')
                self.zero_grad()
                return

            # aggregate stats and logging outputs
            grad_denom = sum(sample_sizes)
            for p in self.model.parameters():
                if p.requires_grad and p.grad is not None:
                    p.grad /= grad_denom

            self._opt()

            # Handle logging
            ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
            self.throughput_meter.update(ntokens)
            info_log_data = {
                'tokens/s': self.throughput_meter.avg,
                'tokens': ntokens,
                'loss': sum(log.get('loss', 0) for log in logging_outputs) / ntokens / math.log(2)
            }
            self.avg_loss_meter.update(info_log_data['loss'])
            debug_log_data = {
                'batch_size': sum(log.get('nsentences', 0) for log in logging_outputs),
                'lr': self.get_lr(),
                'grad_denom': grad_denom,
                'updates': 1
            }

            DLLogger.log(step=self._num_updates, data=info_log_data, verbosity=0)
            DLLogger.log(step=self._num_updates, data=debug_log_data, verbosity=1)

            self.clear_buffered_stats()

    def _forward(self, sample):
        loss = None
        oom = 0
        try:
            if sample is not None:
                with amp.autocast(enabled=self.args.amp):
                    # calculate loss and sample size
                    logits, _ = self.model(**sample['net_input'])
                    target = sample['target']
                    probs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
                    loss = self.criterion(probs, target)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory in worker {}, skipping batch'.format(
                    self.args.distributed_rank), force=True)
                oom = 1
                loss = None
            else:
                raise e
        return loss, oom

    def _backward(self, loss):
        oom = 0
        if loss is not None:
            try:
                self.scaler.scale(loss).backward()
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory in worker {}, skipping batch'.format(
                        self.args.distributed_rank), force=True)
                    oom = 1
                    self.zero_grad()
                else:
                    raise e
        return oom

    def _opt(self):
        # take an optimization step
        self.scaler.step(self.optimizer.optimizer)
        self.scaler.update()
        self.zero_grad()

        self._num_updates += 1

        # update learning rate
        self.lr_scheduler.step_update(self._num_updates)

    def valid_step(self, sample):
        """Do forward pass in evaluation mode."""
        self.model.eval()
        # forward pass
        sample = self._prepare_sample(sample)
        with torch.no_grad():
            loss, oom_fwd = self._forward(sample)
        logging_output = {
            'ntokens': sample['ntokens'] if sample is not None else 0,
            'nsentences': sample['target'].size(0) if sample is not None else 0,
        }
        loss = loss.item() if loss is not None else 0
        assert not oom_fwd, 'Ran out of memory during validation'

        # gather logging outputs from all GPUs
        if self.args.distributed_world_size > 1:
            losses, logging_outputs = zip(*distributed_utils.all_gather_list(
                (loss, logging_output)
            ))
        else:
            losses = [loss]
            logging_outputs = [logging_output]

        weight = sum(log.get('ntokens', 0) for log in logging_outputs)
        scaled_loss = sum(losses) / weight / math.log(2)

        return scaled_loss

    def dummy_train_step(self, dummy_batch):
        """Dummy training step for warming caching allocator."""
        self.train_step(dummy_batch, update_params=False)
        self.zero_grad()
        self.clear_buffered_stats()

    def zero_grad(self):
        self.optimizer.zero_grad()

    def clear_buffered_stats(self):
        self._buffered_stats.clear()

    def lr_step(self, epoch, val_loss=None):
        """Adjust the learning rate based on the validation loss."""
        return self.lr_scheduler.step(epoch, val_loss)

    def lr_step_update(self, num_updates):
        """Update the learning rate after each update."""
        return self.lr_scheduler.step_update(num_updates)

    def get_lr(self):
        """Get the current learning rate."""
        return self.optimizer.get_lr()

    def get_throughput_meter(self):
        """Get the throughput meter"""
        return self.throughput_meter

    def get_model(self):
        """Get the model replica."""
        return self.model.module if isinstance(self.model, DDP) else self.model

    def get_num_updates(self):
        """Get the number of parameters updates."""
        return self._num_updates

    def _prepare_sample(self, sample):
        if not sample:
            return None
        return utils.move_to_cuda(sample)
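The core of the update above is the torch.cuda.amp pattern: forward under autocast, backward on the scaled loss, then scaler.step/scaler.update. A minimal, self-contained sketch of that pattern follows; the toy model, data, and hyperparameters are made up for illustration and are not part of the deleted trainer.

import torch
import torch.nn as nn
from torch.cuda import amp

def toy_train_steps(steps=3, use_amp=torch.cuda.is_available()):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = nn.Linear(16, 4).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = amp.GradScaler(enabled=use_amp, init_scale=2**15)
    for _ in range(steps):
        x = torch.randn(8, 16, device=device)
        target = torch.randint(0, 4, (8,), device=device)
        with amp.autocast(enabled=use_amp):      # mixed-precision forward, as in _forward()
            loss = nn.functional.cross_entropy(model(x), target)
        scaler.scale(loss).backward()            # scaled backward, as in _backward()
        scaler.step(optimizer)                   # unscale + optimizer step, as in _opt()
        scaler.update()
        optimizer.zero_grad()
    return loss.item()

if __name__ == '__main__':
    print(toy_train_steps())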
PyTorch/NLP/Transformer/fairseq/distributed_utils.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
#-------------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
import os
import socket

import torch.distributed

from fairseq import utils


def is_master(args):
    return args.distributed_rank == 0


def distributed_init(args):
    if args.distributed_world_size == 1:
        raise ValueError('Cannot initialize distributed with distributed_world_size=1')

    print('| distributed init (rank {}): {}'.format(
        args.distributed_rank, args.distributed_init_method), flush=True)

    print("| distributed env init. MASTER_ADDR: " + os.environ['MASTER_ADDR'] +
          ", MASTER_PORT: " + os.environ['MASTER_PORT'] +
          ", WORLD_SIZE: " + os.environ['WORLD_SIZE'] +
          ", RANK: " + os.environ['RANK'], flush=True)
    torch.distributed.init_process_group(backend=args.distributed_backend, init_method='env://')
    print("| distributed init done!", flush=True)

    args.distributed_world_size = int(os.environ['WORLD_SIZE'])
    args.distributed_rank = torch.distributed.get_rank()
    args.device_id = int(os.environ.get('LOCAL_RANK', args.local_rank))

    suppress_output(args)
    print('| initialized host {} as rank {} and device id {}'.format(
        socket.gethostname(), args.distributed_rank, args.device_id))

    return args.distributed_rank


def suppress_output(main_args):
    """Suppress printing on the current device. Force printing with `force=True`."""
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print_master(*args, **kwargs):
        if 'force' in kwargs:
            kwargs.pop('force')
        builtin_print(*args, **kwargs)

    def print(*args, **kwargs):
        if 'force' in kwargs:
            force = kwargs.pop('force')
            if force:
                builtin_print(*args, **kwargs)

    if is_master(main_args):
        __builtin__.print = print_master
    else:
        __builtin__.print = print


def all_gather_list(data, max_size=16384):
    """Gathers arbitrary data from all nodes into a list."""
    world_size = torch.distributed.get_world_size()
    if not hasattr(all_gather_list, '_in_buffer') or \
            max_size != len(all_gather_list._in_buffer):
        all_gather_list._in_buffer = torch.cuda.ByteTensor(max_size)
        all_gather_list._out_buffers = [
            torch.cuda.ByteTensor(max_size)
            for i in range(world_size)
        ]
    in_buffer = all_gather_list._in_buffer
    out_buffers = all_gather_list._out_buffers

    enc = pickle.dumps(data)
    enc_size = len(enc)
    if enc_size + 2 > max_size:
        raise ValueError('encoded data exceeds max_size: {}'.format(enc_size + 2))
    assert max_size < 255*256
    in_buffer[0] = enc_size // 255  # this encoding works for max_size < 65k
    in_buffer[1] = enc_size % 255
    in_buffer[2:enc_size+2] = torch.ByteTensor(list(enc))

    torch.distributed.all_gather(out_buffers, in_buffer.cuda())

    result = []
    for i in range(world_size):
        out_buffer = out_buffers[i]
        size = (255 * utils.item(out_buffer[0])) + utils.item(out_buffer[1])
        result.append(
            pickle.loads(bytes(out_buffer[2:size+2].tolist()))
        )
    return result
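all_gather_list ships arbitrary Python objects through fixed-size byte tensors by pickling the payload and storing its length in the first two bytes (size // 255 and size % 255, valid for max_size < 255 * 256). A self-contained sketch of just that encode/decode step, with no process group involved:

import pickle

def encode(data, max_size=16384):
    enc = pickle.dumps(data)
    enc_size = len(enc)
    assert enc_size + 2 <= max_size and max_size < 255 * 256
    buf = bytearray(max_size)
    buf[0] = enc_size // 255        # two-byte length header, as in all_gather_list
    buf[1] = enc_size % 255
    buf[2:enc_size + 2] = enc
    return bytes(buf)

def decode(buf):
    size = 255 * buf[0] + buf[1]
    return pickle.loads(bytes(buf[2:size + 2]))

assert decode(encode({'ntokens': 42, 'loss': 1.5})) == {'ntokens': 42, 'loss': 1.5}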
PyTorch/NLP/Transformer/fairseq/log_helper.py   deleted 100644 → 0

import os
import atexit
import time
import itertools
from collections import OrderedDict

import dllogger
from dllogger import Backend, JSONStreamBackend
from tensorboardX import SummaryWriter


class AverageMeter():
    def __init__(self):
        self.reset()

    def reset(self):
        self.updated = False
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, value):
        self.updated = True
        if isinstance(value, (tuple, list)):
            val = value[0]
            n = value[1]
        else:
            val = value
            n = 1
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    @property
    def value(self):
        return self.avg


class PerformanceMeter():
    def __init__(self):
        self.reset()

    def reset(self):
        self.updated = False
        self.start = time.time()
        self.n = 0

    def update(self, val=1):
        self.updated = True
        self.n += val

    @property
    def value(self):
        return self.n / self.elapsed_time

    @property
    def elapsed_time(self):
        return time.time() - self.start


METRIC = {'average': AverageMeter, 'performance': PerformanceMeter}


class AggregatorBackend(Backend):
    def __init__(self, verbosity, agg_dict):
        super().__init__(verbosity=verbosity)
        agg_dict = OrderedDict({k: v if isinstance(v, (tuple, list)) else (v,) for k, v in agg_dict.items()})
        self.metrics = OrderedDict({k: [METRIC[x]() for x in v] for k, v in agg_dict.items()})
        self.metrics.flushed = True
        self.step = 0
        self.epoch = 0
        self.start_time = time.time()

    @property
    def log_level(self):
        return self._log_level

    def metadata(self, timestamp, elapsedtime, metric, metadata):
        pass

    def _reset_perf_meter(self, name):
        for agg in self.metrics[name]:
            if isinstance(agg, PerformanceMeter):
                agg.reset()

    def reset_perf_meters(self):
        for name in self.metrics.keys():
            self._reset_perf_meter(name)

    def log(self, timestamp, elapsedtime, step, data):
        self.step = step
        if 'epoch' in data.keys():
            self.epoch = data['epoch']
        for k, v in data.items():
            if k not in self.metrics.keys():
                continue
            self.metrics.flushed = False
            for ag in self.metrics[k]:
                ag.update(v)

    def flush(self):
        if self.metrics.flushed:
            return
        result_string = 'Transformer | epoch {} | step {} |'.format(self.epoch, self.step)
        for name, aggregators in self.metrics.items():
            for agg in aggregators:
                if not agg.updated:
                    continue
                if isinstance(agg, AverageMeter):
                    _name = 'avg ' + name
                elif isinstance(agg, PerformanceMeter):
                    _name = name + '/s'
                result_string += _name + ' {:.3f} |'.format(agg.value)
                agg.reset()
        result_string += 'walltime {:.3f} |'.format(time.time() - self.start_time)
        self.metrics.flushed = True
        print(result_string)


class TensorBoardBackend(Backend):
    def __init__(self, verbosity, log_dir):
        super().__init__(verbosity=verbosity)
        self.summary_writer = SummaryWriter(
            log_dir=os.path.join(log_dir, 'TB_summary'),
            flush_secs=120,
            max_queue=200
        )
        atexit.register(self.summary_writer.close)

    @property
    def log_level(self):
        return self._log_level

    def metadata(self, timestamp, elapsedtime, metric, metadata):
        pass

    def log(self, timestamp, elapsedtime, step, data):
        if not isinstance(step, int):
            return
        for k, v in data.items():
            self.summary_writer.add_scalar(k, v, step)

    def flush(self):
        pass


def setup_logger(args):
    aggregator_dict = OrderedDict([
        ('loss', 'average'),
        ('weighted_loss', 'average'),
        ('tokens', ('average', 'performance')),
        ('updates', 'performance'),
        ('gnorm', 'average')
    ])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)

    if os.path.exists(log_path):
        for i in itertools.count():
            s_fname = args.stat_file.split('.')
            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}'
            log_path = os.path.join(args.save_dir, fname)
            if not os.path.exists(log_path):
                break

    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
            TensorBoardBackend(verbosity=1, log_dir=args.save_dir)])
    else:
        dllogger.init(backends=[])
    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = get_framework_env_vars()
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('val_loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'VAL'})
    dllogger.metadata('speed', {'unit': 'tokens/s', 'format': ':.3f', 'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('accuracy', {'unit': 'bleu', 'format': ':.2f', 'GOAL': 'MAXIMIZE', 'STAGE': 'VAL'})


def get_framework_env_vars():
    return {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }


def reset_perf_meters():
    for backend in dllogger.GLOBAL_LOGGER.backends:
        if isinstance(backend, AggregatorBackend):
            backend.reset_perf_meters()
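The one-line conditional in setup_logger that picks a non-clashing log filename is easy to misread, so here is a self-contained sketch of the same logic (paths below are illustrative, not from the repository): if run_log.json exists, it tries run_log_0.json, run_log_1.json, and so on until a free name is found.

import itertools
import os

def next_free_log_path(save_dir, stat_file):
    log_path = os.path.join(save_dir, stat_file)
    if not os.path.exists(log_path):
        return log_path
    for i in itertools.count():
        s_fname = stat_file.split('.')
        fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else stat_file + f'.{i}'
        log_path = os.path.join(save_dir, fname)
        if not os.path.exists(log_path):
            return log_path

print(next_free_log_path('results', 'run_log.json'))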
PyTorch/NLP/Transformer/fairseq/meters.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import time


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class TimeMeter(object):
    """Computes the average occurrence of some event per second"""

    def __init__(self, init=0):
        self.reset(init)

    def reset(self, init=0):
        self.init = init
        self.start = time.time()
        self.n = 0
        self.last_update = time.time()

    def update(self, val=1):
        self.n += val
        self.last_update = time.time()

    @property
    def avg(self):
        return self.n / self.elapsed_time

    @property
    def elapsed_time(self):
        return self.init + (time.time() - self.start)

    @property
    def u_avg(self):
        return self.n / (self.last_update - self.start)


class StopwatchMeter(object):
    """Computes the sum/avg duration of some event in seconds"""

    def __init__(self):
        self.reset()
        self.intervals = []

    def start(self):
        self.start_time = time.time()

    def stop(self, n=1):
        if self.start_time is not None:
            delta = time.time() - self.start_time
            self.intervals.append(delta)
            self.sum += delta
            self.n += n
            self.start_time = None

    def reset(self):
        self.sum = 0
        self.n = 0
        self.start_time = None
        self.intervals = []

    @property
    def avg(self):
        return self.sum / self.n

    def p(self, i):
        assert i <= 100
        idx = int(len(self.intervals) * i / 100)
        return sorted(self.intervals)[idx]
PyTorch/NLP/Transformer/fairseq/models/__init__.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import importlib
import os

from .fairseq_incremental_decoder import FairseqIncrementalDecoder  # noqa: F401

MODEL_REGISTRY = {}
ARCH_MODEL_REGISTRY = {}
ARCH_CONFIG_REGISTRY = {}


def build_model(args):
    return ARCH_MODEL_REGISTRY[args.arch].build_model(args)


def register_model(name):
    """Decorator to register a new model (e.g., LSTM)."""

    def register_model_cls(cls):
        if name in MODEL_REGISTRY:
            raise ValueError('Cannot register duplicate model ({})'.format(name))
        MODEL_REGISTRY[name] = cls
        return cls

    return register_model_cls


def register_model_architecture(model_name, arch_name):
    """Decorator to register a new model architecture (e.g., lstm_luong_wmt_en_de)."""

    def register_model_arch_fn(fn):
        if model_name not in MODEL_REGISTRY:
            raise ValueError('Cannot register model architecture for unknown model type ({})'.format(model_name))
        if arch_name in ARCH_MODEL_REGISTRY:
            raise ValueError('Cannot register duplicate model architecture ({})'.format(arch_name))
        if not callable(fn):
            raise ValueError('Model architecture must be callable ({})'.format(arch_name))
        ARCH_MODEL_REGISTRY[arch_name] = MODEL_REGISTRY[model_name]
        ARCH_CONFIG_REGISTRY[arch_name] = fn
        return fn

    return register_model_arch_fn


# automatically import any Python files in the models/ directory
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith('.py') and not file.startswith('_'):
        module = file[:file.find('.py')]
        importlib.import_module('fairseq.models.' + module)
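A hedged usage sketch of the two decorators above, assuming the repository's fairseq package (and its CUDA/apex dependencies pulled in by the auto-import loop) is importable. The names 'dummy' and dummy_base are hypothetical and exist only for this illustration.

from fairseq.models import (
    register_model, register_model_architecture, ARCH_MODEL_REGISTRY,
)

@register_model('dummy')
class DummyModel:
    @classmethod
    def build_model(cls, args):
        return cls()

@register_model_architecture('dummy', 'dummy_base')
def dummy_base(args):
    # architecture functions only fill in default hyperparameters on args
    args.dummy_dim = getattr(args, 'dummy_dim', 8)

# the architecture name now resolves to the registered model class
assert ARCH_MODEL_REGISTRY['dummy_base'] is DummyModel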
PyTorch/NLP/Transformer/fairseq/models/fairseq_incremental_decoder.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import torch.nn as nn


class FairseqIncrementalDecoder(nn.Module):
    """Base class for incremental decoders."""

    def __init__(self):
        super().__init__()

    def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
        raise NotImplementedError

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder incremental state.

        This should be called when the order of the input has changed from the
        previous time step. A typical use case is beam search, where the input
        order changes between time steps based on the selection of beams.
        """
        def apply_reorder_incremental_state(module):
            if module != self and hasattr(module, 'reorder_incremental_state'):
                module.reorder_incremental_state(
                    incremental_state,
                    new_order,
                )
        self.apply(apply_reorder_incremental_state)

    def set_beam_size(self, beam_size):
        """Sets the beam size in the decoder and all children."""
        if getattr(self, '_beam_size', -1) != beam_size:
            def apply_set_beam_size(module):
                if module != self and hasattr(module, 'set_beam_size'):
                    module.set_beam_size(beam_size)
            self.apply(apply_set_beam_size)
            self._beam_size = beam_size
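A self-contained illustration of what reorder_incremental_state is for: during beam search the batch order changes each step, so any cached tensors must be permuted with the same new_order. The cache layout below is hypothetical; real submodules keep their own cached key/value tensors.

import torch

cached_keys = torch.arange(4).unsqueeze(1).repeat(1, 3).float()  # one cached row per beam
new_order = torch.tensor([2, 2, 0, 1])                           # beams selected at this step
reordered = cached_keys.index_select(0, new_order)               # what a reorder hook would do
print(reordered[:, 0])                                           # tensor([2., 2., 0., 1.])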
PyTorch/NLP/Transformer/fairseq/models/fused_layer_norm.py   deleted 100644 → 0

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import torch
import numbers
from torch.nn.parameter import Parameter
from torch.nn import init

import fused_layer_norm_cuda


class FusedLayerNormAffineFunction(torch.autograd.Function):

    def __init__(self, normalized_shape, eps=1e-6):
        self.normalized_shape = normalized_shape
        self.eps = eps

    def forward(self, input, weight, bias):
        input_ = input.contiguous()
        weight_ = weight.contiguous()
        bias_ = bias.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward_affine(
            input_, self.normalized_shape, weight_, bias_, self.eps)
        self.save_for_backward(input_, weight_, bias_, mean, invvar)
        return output

    def backward(self, grad_output):
        input_, weight_, bias_, mean, invvar = self.saved_tensors
        grad_input = grad_weight = grad_bias = None
        grad_input, grad_weight, grad_bias = fused_layer_norm_cuda.backward_affine(
            grad_output.contiguous(), mean, invvar,
            input_, self.normalized_shape,
            weight_, bias_, self.eps)
        return grad_input, grad_weight, grad_bias


class FusedLayerNormFunction(torch.autograd.Function):

    def __init__(self, normalized_shape, eps=1e-6):
        self.normalized_shape = normalized_shape
        self.eps = eps

    def forward(self, input):
        input_ = input.contiguous()
        output, mean, invvar = fused_layer_norm_cuda.forward(
            input_, self.normalized_shape, self.eps)
        self.save_for_backward(input_, mean, invvar)
        return output

    def backward(self, grad_output):
        input_, mean, invvar = self.saved_tensors
        grad_input = None
        grad_input = fused_layer_norm_cuda.backward(
            grad_output.contiguous(), mean, invvar,
            input_, self.normalized_shape,
            self.eps)
        return grad_input


def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1e-6):
    return FusedLayerNormAffineFunction(normalized_shape, eps)(input, weight, bias)


def fused_layer_norm(input, normalized_shape, eps=1e-6):
    return FusedLayerNormFunction(normalized_shape, eps)(input)


class FusedLayerNorm(torch.nn.Module):
    r"""Applies Layer Normalization over a mini-batch of inputs as described in
    the paper `Layer Normalization`_ .

    .. math::
        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated separately over the last
    certain number dimensions which have to be of the shape specified by
    :attr:`normalized_shape`.
    :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.

    .. note::
        Unlike Batch Normalization and Instance Normalization, which applies
        scalar scale and bias for each entire channel/plane with the
        :attr:`affine` option, Layer Normalization applies per-element scale and
        bias with :attr:`elementwise_affine`.

    This layer uses statistics computed from input data in both training and
    evaluation modes.

    Args:
        normalized_shape (int or list or torch.Size): input shape from an expected input
            of size

            .. math::
                [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1]
                    \times \ldots \times \text{normalized\_shape}[-1]]

            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps: a value added to the denominator for numerical stability. Default: 1e-5
        elementwise_affine: a boolean value that when set to ``True``, this module
            has learnable per-element affine parameters initialized to ones (for weights)
            and zeros (for biases). Default: ``True``.

    Shape:
        - Input: :math:`(N, *)`
        - Output: :math:`(N, *)` (same shape as input)

    Examples::
        >>> input = torch.randn(20, 5, 10, 10)
        >>> # With Learnable Parameters
        >>> m = nn.LayerNorm(input.size()[1:])
        >>> # Without Learnable Parameters
        >>> m = nn.LayerNorm(input.size()[1:], elementwise_affine=False)
        >>> # Normalize over last two dimensions
        >>> m = nn.LayerNorm([10, 10])
        >>> # Normalize over last dimension of size 10
        >>> m = nn.LayerNorm(10)
        >>> # Activating the module
        >>> output = m(input)

    .. _`Layer Normalization`: https://arxiv.org/abs/1607.06450
    """

    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super(FusedLayerNorm, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = torch.Size(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = Parameter(torch.Tensor(*normalized_shape))
            self.bias = Parameter(torch.Tensor(*normalized_shape))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        if self.elementwise_affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def forward(self, input):
        if self.elementwise_affine:
            return FusedLayerNormAffineFunction(self.normalized_shape, self.eps)(
                input, self.weight, self.bias)
        else:
            return FusedLayerNormFunction(self.normalized_shape, self.eps)(input)

    def extra_repr(self):
        return '{normalized_shape}, eps={eps}, ' \
            'elementwise_affine={elementwise_affine}'.format(**self.__dict__)
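For reference, the math the fused kernel implements is ordinary layer normalization. A hedged, self-contained check against torch.nn.functional.layer_norm (this does not call the fused_layer_norm_cuda extension and is for illustration only):

import torch
import torch.nn.functional as F

def layer_norm_ref(x, weight, bias, eps=1e-5):
    # y = (x - E[x]) / sqrt(Var[x] + eps) * gamma + beta, over the last dimension
    mean = x.mean(dim=-1, keepdim=True)
    var = x.var(dim=-1, unbiased=False, keepdim=True)
    return (x - mean) / torch.sqrt(var + eps) * weight + bias

x = torch.randn(4, 10)
w, b = torch.ones(10), torch.zeros(10)
assert torch.allclose(layer_norm_ref(x, w, b), F.layer_norm(x, (10,), w, b), atol=1e-6)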
PyTorch/NLP/Transformer/fairseq/models/transformer.py   deleted 100644 → 0

(Diff collapsed on the original page; file contents not shown.)
PyTorch/NLP/Transformer/fairseq/modules/__init__.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from .beamable_mm import BeamableMM
from .learned_positional_embedding import LearnedPositionalEmbedding
from .multihead_attention import MultiheadAttention
from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding

__all__ = [
    'BeamableMM',
    'LearnedPositionalEmbedding',
    'MultiheadAttention',
    'SinusoidalPositionalEmbedding',
]
PyTorch/NLP/Transformer/fairseq/modules/learned_positional_embedding.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import torch.nn as nn

from fairseq import utils


class LearnedPositionalEmbedding(nn.Embedding):
    """This module learns positional embeddings up to a fixed maximum size.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).
    """

    def __init__(self, num_embeddings, embedding_dim, padding_idx, left_pad):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.left_pad = left_pad

    def forward(self, input, incremental_state=None):
        """Input is expected to be of size [bsz x seqlen]."""
        if incremental_state is not None:
            # positions is the same for every token when decoding a single step
            positions = input.data.new(1, 1).fill_(self.padding_idx + input.size(1))
        else:
            positions = utils.make_positions(input.data, self.padding_idx, self.left_pad)
        return super().forward(positions)
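The embedding above relies on utils.make_positions to map tokens to position indices that skip padding. A hedged, self-contained sketch of that computation for the right-padded case (the real helper also handles left padding): non-pad tokens get consecutive positions starting at padding_idx + 1, and pad tokens keep padding_idx.

import torch

def make_positions_ref(tokens, padding_idx):
    mask = tokens.ne(padding_idx)
    return torch.cumsum(mask, dim=1) * mask + padding_idx

tokens = torch.tensor([[5, 6, 7, 1, 1]])           # 1 is the pad index in this example
print(make_positions_ref(tokens, padding_idx=1))   # tensor([[2, 3, 4, 1, 1]])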
PyTorch/NLP/Transformer/fairseq/modules/multihead_attention.py   deleted 100644 → 0

(Diff collapsed on the original page; file contents not shown.)
PyTorch/NLP/Transformer/fairseq/modules/strided_batched_gemm/strided_batched_gemm.cpp   deleted 100644 → 0

// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <torch/torch.h>
#include <vector>

at::Tensor strided_batched_gemm_cuda(
    float beta,
    at::Tensor in_result,
    float alpha,
    at::Tensor batch1,
    at::Tensor batch2);

// C++ interface
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

at::Tensor strided_batched_gemm(
    float beta,
    at::Tensor in_result,
    float alpha,
    at::Tensor batch1,
    at::Tensor batch2) {
  //CHECK_INPUT(in_result);
  //CHECK_INPUT(batch1);
  //CHECK_INPUT(batch2);

  AT_ASSERTM(in_result.dim() == 3, "expected 3D tensor");
  AT_ASSERTM(batch1.dim() == 3, "expected 3D tensor");
  AT_ASSERTM(batch2.dim() == 3, "expected 3D tensor");

  AT_ASSERTM(in_result.size(0) == batch1.size(0), "equal number of batches expected");
  AT_ASSERTM(in_result.size(0) == batch2.size(0), "equal number of batches expected");

  AT_ASSERTM(in_result.size(1) == batch1.size(1), "wrong matrix size");
  AT_ASSERTM(in_result.size(2) == batch2.size(2), "wrong matrix size");
  AT_ASSERTM(batch1.size(2) == batch2.size(1), "wrong matrix size");

  AT_ASSERTM(batch1.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
  AT_ASSERTM(batch2.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
  AT_ASSERTM(in_result.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");

  return strided_batched_gemm_cuda(beta, in_result, alpha, batch1, batch2);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("strided_batched_gemm", &strided_batched_gemm, "Special strided batched gemm.");
}
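The binding above exposes a half-precision batched GEMM computing beta * in_result + alpha * batch1 @ batch2. As a hedged point of reference (not the custom kernel itself), the same arithmetic at the PyTorch level is torch.baddbmm:

import torch

b, m, k, n = 2, 3, 4, 5
in_result = torch.randn(b, m, n)
batch1, batch2 = torch.randn(b, m, k), torch.randn(b, k, n)

out = torch.baddbmm(in_result, batch1, batch2, beta=0.5, alpha=2.0)
ref = 0.5 * in_result + 2.0 * torch.bmm(batch1, batch2)
assert torch.allclose(out, ref, atol=1e-5)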
PyTorch/NLP/Transformer/fairseq/modules/strided_batched_gemm/strided_batched_gemm_cuda.cu   deleted 100644 → 0

(Diff collapsed on the original page; file contents not shown.)
PyTorch/NLP/Transformer/fairseq/optim/__init__.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import importlib
import os

from .fairseq_optimizer import FairseqOptimizer

OPTIMIZER_REGISTRY = {}
OPTIMIZER_CLASS_NAMES = set()


def build_optimizer(args, params):
    params = filter(lambda p: p.requires_grad, params)
    return OPTIMIZER_REGISTRY[args.optimizer](args, params)


def register_optimizer(name):
    """Decorator to register a new optimizer."""

    def register_optimizer_cls(cls):
        if name in OPTIMIZER_REGISTRY:
            raise ValueError('Cannot register duplicate optimizer ({})'.format(name))
        if not issubclass(cls, FairseqOptimizer):
            raise ValueError('Optimizer ({}: {}) must extend FairseqOptimizer'.format(name, cls.__name__))
        if cls.__name__ in OPTIMIZER_CLASS_NAMES:
            # We use the optimizer class name as a unique identifier in
            # checkpoints, so all optimizer must have unique class names.
            raise ValueError('Cannot register optimizer with duplicate class name ({})'.format(cls.__name__))
        OPTIMIZER_REGISTRY[name] = cls
        OPTIMIZER_CLASS_NAMES.add(cls.__name__)
        return cls

    return register_optimizer_cls


# automatically import any Python files in the optim/ directory
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith('.py') and not file.startswith('_'):
        module = file[:file.find('.py')]
        importlib.import_module('fairseq.optim.' + module)
PyTorch/NLP/Transformer/fairseq/optim/adam.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
#-------------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import FairseqOptimizer, register_optimizer
from apex.optimizers.fused_adam import FusedAdam


@register_optimizer('adam')
class FairseqAdam(FairseqOptimizer):

    def __init__(self, args, params):
        super().__init__(args, params)
        self._optimizer = FusedAdam(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        parser.add_argument('--adam-betas', default=(0.9, 0.999), nargs=2, type=float,
                            metavar='B1 B2', help='betas for Adam optimizer')
        parser.add_argument('--adam-eps', type=float, default=1e-8, metavar='D',
                            help='epsilon for Adam optimizer')

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            'lr': self.args.lr[0],
            'betas': self.args.adam_betas,
            'eps': self.args.adam_eps,
            'weight_decay': self.args.weight_decay,
        }
PyTorch/NLP/Transformer/fairseq/optim/fairseq_optimizer.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
#-------------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.optim


class FairseqOptimizer(object):

    def __init__(self, args, params):
        super().__init__()
        self.args = args
        self.params = params

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        pass

    @property
    def optimizer(self):
        """Return a torch.optim.optimizer.Optimizer instance."""
        if not hasattr(self, '_optimizer'):
            raise NotImplementedError
        if not isinstance(self._optimizer, torch.optim.Optimizer):
            raise ValueError('_optimizer must be an instance of torch.optim.Optimizer')
        return self._optimizer

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        raise NotImplementedError

    def get_lr(self):
        """Return the current learning rate."""
        return self.optimizer.param_groups[0]['lr']

    def set_lr(self, lr):
        """Set the learning rate."""
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def state_dict(self):
        """Return the optimizer's state dict."""
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        """Load an optimizer state dict.

        In general we should prefer the configuration of the existing optimizer
        instance (e.g., learning rate) over that found in the state_dict. This
        allows us to resume training from a checkpoint using a new set of
        optimizer args.
        """
        self.optimizer.load_state_dict(state_dict)

        # override learning rate, momentum, etc. with latest values
        for group in self.optimizer.param_groups:
            group.update(self.optimizer_config)

    def step(self, closure=None):
        """Performs a single optimization step."""
        return self.optimizer.step(closure)

    def zero_grad(self):
        """Clears the gradients of all optimized parameters."""
        for group in self.optimizer.param_groups:
            for p in group['params']:
                p.grad = None
        return self.optimizer.zero_grad()
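The override in load_state_dict above is the interesting part: after restoring optimizer state from a checkpoint, the current run's hyperparameters (the optimizer_config dict) win over whatever was stored. A hedged, self-contained sketch of the same idea with a plain torch optimizer:

import torch

model = torch.nn.Linear(2, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.5)
saved = opt.state_dict()                                   # checkpoint written with lr=0.5

opt_resume = torch.optim.SGD(model.parameters(), lr=0.1)   # new run wants lr=0.1
opt_resume.load_state_dict(saved)                          # restores lr=0.5 from the checkpoint
for group in opt_resume.param_groups:                      # so re-apply the new config on top
    group.update({'lr': 0.1})
assert opt_resume.param_groups[0]['lr'] == 0.1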
PyTorch/NLP/Transformer/fairseq/optim/lr_scheduler/__init__.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import importlib
import os

from .fairseq_lr_scheduler import FairseqLRScheduler

LR_SCHEDULER_REGISTRY = {}


def build_lr_scheduler(args, optimizer):
    return LR_SCHEDULER_REGISTRY[args.lr_scheduler](args, optimizer)


def register_lr_scheduler(name):
    """Decorator to register a new LR scheduler."""

    def register_lr_scheduler_cls(cls):
        if name in LR_SCHEDULER_REGISTRY:
            raise ValueError('Cannot register duplicate LR scheduler ({})'.format(name))
        if not issubclass(cls, FairseqLRScheduler):
            raise ValueError('LR Scheduler ({}: {}) must extend FairseqLRScheduler'.format(name, cls.__name__))
        LR_SCHEDULER_REGISTRY[name] = cls
        return cls

    return register_lr_scheduler_cls


# automatically import any Python files in the optim/lr_scheduler/ directory
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith('.py') and not file.startswith('_'):
        module = file[:file.find('.py')]
        importlib.import_module('fairseq.optim.lr_scheduler.' + module)
PyTorch/NLP/Transformer/fairseq/optim/lr_scheduler/fixed_schedule.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

from . import FairseqLRScheduler, register_lr_scheduler


@register_lr_scheduler('fixed')
class FixedSchedule(FairseqLRScheduler):
    """Decay the LR on a fixed schedule."""

    def __init__(self, args, optimizer):
        super().__init__(args, optimizer)

        # set defaults
        args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0

        self.lr = args.lr[0]
        if args.warmup_updates > 0:
            self.warmup_factor = 1. / args.warmup_updates
        else:
            self.warmup_factor = 1

    @staticmethod
    def add_args(parser):
        """Add arguments to the parser for this LR scheduler."""
        parser.add_argument('--force-anneal', '--fa', type=int, metavar='N',
                            help='force annealing at specified epoch')
        parser.add_argument('--warmup-updates', default=0, type=int, metavar='N',
                            help='warmup the learning rate linearly for the first N updates')

    def get_next_lr(self, epoch):
        lrs = self.args.lr
        if self.args.force_anneal is None or epoch < self.args.force_anneal:
            # use fixed LR schedule
            next_lr = lrs[min(epoch, len(lrs) - 1)]
        else:
            # anneal based on lr_shrink
            next_lr = lrs[-1] * self.args.lr_shrink ** (epoch + 1 - self.args.force_anneal)
        return next_lr

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        self.lr = self.get_next_lr(epoch)
        self.optimizer.set_lr(self.warmup_factor * self.lr)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates:
            self.warmup_factor = num_updates / float(self.args.warmup_updates)
            self.optimizer.set_lr(self.warmup_factor * self.lr)
        return self.optimizer.get_lr()
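A small sketch of the resulting learning-rate curve from step_update above: linear warmup over --warmup-updates steps, then the fixed per-epoch value. The numbers are illustrative only.

base_lr, warmup_updates = 0.5, 4

def lr_at(num_updates):
    if warmup_updates > 0 and num_updates <= warmup_updates:
        return (num_updates / float(warmup_updates)) * base_lr
    return base_lr

print([round(lr_at(u), 3) for u in range(1, 8)])
# [0.125, 0.25, 0.375, 0.5, 0.5, 0.5, 0.5]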
PyTorch/NLP/Transformer/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py   deleted 100644 → 0

# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import torch.optim.lr_scheduler

from . import FairseqLRScheduler, register_lr_scheduler


@register_lr_scheduler('reduce_lr_on_plateau')
class ReduceLROnPlateau(FairseqLRScheduler):
    """Decay the LR by a factor every time the validation loss plateaus."""

    def __init__(self, args, optimizer):
        super().__init__(args, optimizer)
        if len(args.lr) > 1:
            raise ValueError(
                'Cannot use a fixed learning rate schedule with reduce_lr_on_plateau.'
                ' Consider --lr-scheduler=fixed instead.'
            )
        self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer.optimizer, patience=0, factor=args.lr_shrink)

    def state_dict(self):
        """Return the LR scheduler state dict."""
        return {
            'best': self.lr_scheduler.best,
            'last_epoch': self.lr_scheduler.last_epoch,
        }

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        self.lr_scheduler.best = state_dict['best']
        if 'last_epoch' in state_dict:
            self.lr_scheduler.last_epoch = state_dict['last_epoch']

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        if val_loss is not None:
            self.lr_scheduler.step(val_loss, epoch)
        else:
            self.lr_scheduler.last_epoch = epoch
        return self.optimizer.get_lr()
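A hedged usage sketch of the underlying torch scheduler the wrapper delegates to: with patience=0 and factor=lr_shrink, the learning rate is multiplied by lr_shrink each time the validation loss fails to improve. The losses below are illustrative.

import torch

model = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(model.parameters(), lr=1.0)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=0, factor=0.5)

for val_loss in [10.0, 9.0, 9.5, 9.4]:   # no improvement on the last two epochs
    sched.step(val_loss)
print(opt.param_groups[0]['lr'])          # 0.25 after two shrinks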
PyTorch/NLP/Transformer/fairseq/options.py
deleted
100644 → 0
View file @
c056df78
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
#
#-------------------------------------------------------------------------
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
os
import
torch
from
fairseq.models
import
ARCH_MODEL_REGISTRY
,
ARCH_CONFIG_REGISTRY
from
fairseq.criterions
import
CRITERION_REGISTRY
from
fairseq.optim
import
OPTIMIZER_REGISTRY
from
fairseq.optim.lr_scheduler
import
LR_SCHEDULER_REGISTRY
def
get_training_parser
():
parser
=
get_parser
(
'Trainer'
)
add_dataset_args
(
parser
,
train
=
True
,
gen
=
True
)
add_distributed_training_args
(
parser
)
add_model_args
(
parser
)
add_optimization_args
(
parser
)
add_checkpoint_args
(
parser
)
add_inference_args
(
parser
)
add_perf_args
(
parser
)
return
parser
def
get_inference_parser
():
parser
=
get_parser
(
'Generation'
)
add_dataset_args
(
parser
,
gen
=
True
)
add_inference_args
(
parser
)
add_perf_args
(
parser
)
return
parser
def
parse_args_and_arch
(
parser
,
input_args
=
None
,
parse_known
=
False
):
# The parser doesn't know about model/criterion/optimizer-specific args, so
# we parse twice. First we parse the model/criterion/optimizer, then we
# parse a second time after adding the *-specific arguments.
# If input_args is given, we will parse those args instead of sys.argv.
args
,
_
=
parser
.
parse_known_args
(
input_args
)
# Add model-specific args to parser.
if
hasattr
(
args
,
'arch'
):
model_specific_group
=
parser
.
add_argument_group
(
'Model-specific configuration'
,
# Only include attributes which are explicitly given as command-line
# arguments or which have default values.
argument_default
=
argparse
.
SUPPRESS
,
)
ARCH_MODEL_REGISTRY
[
args
.
arch
].
add_args
(
model_specific_group
)
# Add *-specific args to parser.
if
hasattr
(
args
,
'optimizer'
):
OPTIMIZER_REGISTRY
[
args
.
optimizer
].
add_args
(
parser
)
if
hasattr
(
args
,
'lr_scheduler'
):
LR_SCHEDULER_REGISTRY
[
args
.
lr_scheduler
].
add_args
(
parser
)
# Parse a second time.
if
parse_known
:
args
,
extra
=
parser
.
parse_known_args
(
input_args
)
else
:
args
=
parser
.
parse_args
(
input_args
)
extra
=
None
# Post-process args.
if
hasattr
(
args
,
'max_sentences_valid'
)
and
args
.
max_sentences_valid
is
None
:
args
.
max_sentences_valid
=
args
.
max_sentences
args
.
max_positions
=
(
args
.
max_source_positions
,
args
.
max_target_positions
)
if
hasattr
(
args
,
'target_bleu'
)
and
(
args
.
online_eval
or
args
.
target_bleu
)
and
not
args
.
remove_bpe
:
args
.
remove_bpe
=
'@@ '
# Apply architecture configuration.
if
hasattr
(
args
,
'arch'
):
ARCH_CONFIG_REGISTRY
[
args
.
arch
](
args
)
if
parse_known
:
return
args
,
extra
else
:
return
args

def get_parser(desc):
    parser = argparse.ArgumentParser(
        description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc)
    parser.add_argument('--log-interval', type=int, default=500, metavar='N',
                        help='print aggregated stats and flush json log every N iterations')
    parser.add_argument('--seed', default=1, type=int, metavar='N',
                        help='pseudo random number generator seed')
    parser.add_argument('--amp', action='store_true',
                        help='use Automatic Mixed Precision')
    parser.add_argument('--stat-file', type=str, default='run_log.json',
                        help='Name of the file containing DLLogger output')
    parser.add_argument('--save-dir', metavar='DIR', default='results',
                        help='path to save checkpoints and logs')
    parser.add_argument('--do-sanity-check', action='store_true',
                        help='Perform evaluation on test set before running the training')
    return parser

def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group('Dataset and data loading')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
                        help='source language')
    parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
                        help='target language')
    parser.add_argument('--raw-text', action='store_true',
                        help='load raw text dataset')
    parser.add_argument('--left-pad-source', default=True, type=bool, metavar='BOOL',
                        help='pad the source on the left (default: True)')
    parser.add_argument('--left-pad-target', default=False, type=bool, metavar='BOOL',
                        help='pad the target on the left (default: False)')
    parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N',
                        help='max number of tokens in the source sequence')
    parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N',
                        help='max number of tokens in the target sequence')
    parser.add_argument('--pad-sequence', default=1, type=int, metavar='N',
                        help='Pad sequences to a multiple of N')
    if train:
        parser.add_argument('data', metavar='DIR',
                            help='path to data directory')
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           choices=['train', 'valid', 'test'],
                           help='data subset to use for training (train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (train, valid, valid1, test, test1)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
    return group

def add_distributed_training_args(parser):
    group = parser.add_argument_group('Distributed training')
    group.add_argument('--distributed-world-size', type=int, metavar='N',
                       default=torch.cuda.device_count(),
                       help='total number of GPUs across all nodes (default: all visible GPUs)')
    # Cast the default: os.getenv returns a string when LOCAL_RANK is set, and
    # argparse only applies type=int to values given on the command line.
    group.add_argument('--distributed-rank', default=int(os.getenv('LOCAL_RANK', 0)), type=int,
                       help='rank of the current worker')
    group.add_argument('--local_rank', default=0, type=int,
                       help='rank of the current worker')
    group.add_argument('--distributed-backend', default='nccl', type=str,
                       help='distributed backend')
    group.add_argument('--distributed-init-method', default=None, type=str,
                       help='typically tcp://hostname:port that will be used to '
                            'establish initial connection')
    group.add_argument('--distributed-port', default=-1, type=int,
                       help='port number (not required if using --distributed-init-method)')
    group.add_argument('--device-id', default=0, type=int,
                       help='which GPU to use (usually configured automatically)')
    return group
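
# A generic sketch of how flags like the ones above are typically consumed
# when initializing torch.distributed. This is an illustration only: it is
# not taken from this repository's distributed_utils.py, and the exact wiring
# there may differ.
def _demo_init_distributed(args):
    import torch.distributed as dist

    if args.distributed_world_size > 1:
        # Bind this process to its GPU before creating the process group.
        torch.cuda.set_device(args.device_id)
        dist.init_process_group(
            backend=args.distributed_backend,           # e.g. 'nccl'
            init_method=args.distributed_init_method,   # e.g. 'tcp://host:port'
            world_size=args.distributed_world_size,
            rank=args.distributed_rank,
        )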

def add_optimization_args(parser):
    group = parser.add_argument_group('Optimization')
    group.add_argument('--max-epoch', '--me', default=0, type=int, metavar='N',
                       help='force stop training at specified epoch')
    group.add_argument('--max-update', '--mu', default=0, type=int, metavar='N',
                       help='force stop training at specified update')
    group.add_argument('--target-bleu', default=0.0, type=float, metavar='TARGET',
                       help='force stop training after reaching target bleu')
    group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
                       help='clip threshold of gradients')
    group.add_argument('--update-freq', default=[1], nargs='+', type=int,
                       help='update parameters every N_i batches, when in epoch i')

    # Optimizer definitions can be found under fairseq/optim/
    group.add_argument('--optimizer', default='nag', metavar='OPT',
                       choices=OPTIMIZER_REGISTRY.keys(),
                       help='optimizer: {} (default: nag)'.format(
                           ', '.join(OPTIMIZER_REGISTRY.keys())))
    group.add_argument('--lr', '--learning-rate', default=[0.25], nargs='+', type=float,
                       help='learning rate for the first N epochs; all epochs >N using LR_N'
                            ' (note: this may be interpreted differently depending on --lr-scheduler)')
    group.add_argument('--momentum', default=0.99, type=float, metavar='M',
                       help='momentum factor')
    group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                       help='weight decay')

    # Learning rate schedulers can be found under fairseq/optim/lr_scheduler/
    group.add_argument('--lr-scheduler', default='reduce_lr_on_plateau',
                       help='learning rate scheduler: {} (default: reduce_lr_on_plateau)'.format(
                           ', '.join(LR_SCHEDULER_REGISTRY.keys())))
    group.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS',
                       help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)')
    group.add_argument('--min-lr', default=1e-5, type=float, metavar='LR',
                       help='minimum learning rate')

    # Criterion args
    parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                        help='epsilon for label smoothing, 0 means no label smoothing')

    return group
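
# '--lr' and '--update-freq' use nargs='+', so they always parse to lists
# (e.g. '--lr 0.25 0.1' yields [0.25, 0.1]). A common convention, assumed
# here and not verified against this repository's trainer, is to index the
# list by epoch and reuse the last entry once the list is exhausted:
def _demo_per_epoch_value(values, epoch):
    # e.g. _demo_per_epoch_value([0.25, 0.1], 0) -> 0.25
    #      _demo_per_epoch_value([0.25, 0.1], 5) -> 0.1
    return values[min(epoch, len(values) - 1)]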

def add_checkpoint_args(parser):
    group = parser.add_argument_group('Checkpointing')
    group.add_argument('--restore-file', default='checkpoint_last.pt',
                       help='filename in save-dir from which to load checkpoint')
    group.add_argument('--save-interval', type=int, default=1, metavar='N',
                       help='save a checkpoint every N epochs')
    group.add_argument('--no-save', action='store_true',
                       help='don\'t save models or checkpoints')
    group.add_argument('--no-epoch-checkpoints', action='store_true',
                       help='only store last and best checkpoints')
    group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                       help='validate every N epochs')
    return group

def add_common_eval_args(group):
    group.add_argument('--path', metavar='FILE',
                       help='path(s) to model file(s), colon separated')
    group.add_argument('--file', metavar='FILE', default=None, type=str,
                       help='path to a file with input data for inference')
    group.add_argument('--remove-bpe', nargs='?', const='@@ ', default=None,
                       help='remove BPE tokens before scoring')
    group.add_argument('--cpu', action='store_true',
                       help='generate on CPU')
    group.add_argument('--quiet', action='store_true',
                       help='only print final scores')

def add_inference_args(parser):
    group = parser.add_argument_group('Generation')
    add_common_eval_args(group)
    group.add_argument('--beam', default=4, type=int, metavar='N',
                       help='beam size')
    group.add_argument('--nbest', default=1, type=int, metavar='N',
                       help='number of hypotheses to output')
    group.add_argument('--max-len-a', default=0, type=float, metavar='N',
                       help=('generate sequences of maximum length ax + b, '
                             'where x is the source length'))
    group.add_argument('--max-len-b', default=200, type=int, metavar='N',
                       help=('generate sequences of maximum length ax + b, '
                             'where x is the source length'))
    group.add_argument('--min-len', default=1, type=float, metavar='N',
                       help=('minimum generation length'))
    group.add_argument('--no-early-stop', action='store_true',
                       help=('continue searching even after finalizing k=beam '
                             'hypotheses; this is more correct, but increases '
                             'generation time by 50%%'))
    group.add_argument('--unnormalized', action='store_true',
                       help='compare unnormalized hypothesis scores')
    group.add_argument('--no-beamable-mm', action='store_true',
                       help='don\'t use BeamableMM in attention layers')
    group.add_argument('--lenpen', default=1, type=float,
                       help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences')
    group.add_argument('--unkpen', default=0, type=float,
                       help='unknown word penalty: <0 produces more unks, >0 produces fewer')
    group.add_argument('--replace-unk', nargs='?', const=True, default=None,
                       help='perform unknown replacement (optionally with alignment dictionary)')
    group.add_argument('--prefix-size', default=0, type=int, metavar='PS',
                       help='initialize generation by target prefix of given length')
    group.add_argument('--sampling', action='store_true',
                       help='sample hypotheses instead of using beam search')
    group.add_argument('--sampling-topk', default=-1, type=int, metavar='PS',
                       help='sample from top K likely next words instead of all words')
    group.add_argument('--sampling-temperature', default=1, type=float, metavar='N',
                       help='temperature for random sampling')
    group.add_argument('--print-alignment', action='store_true',
                       help='if set, uses attention feedback to compute and print alignment to source tokens')
    group.add_argument('--online-eval', action='store_true',
                       help='score model at the end of epoch')
    group.add_argument('--save-predictions', action='store_true',
                       help='Save predictions produced with online evaluation')
    group.add_argument('--test-cased-bleu', action='store_true',
                       help='Use cased bleu for online eval')
    group.add_argument('--bpe-codes', default=None, type=str, metavar='CODES',
                       help='file with bpe codes')
    group.add_argument('--buffer-size', default=64, type=int, metavar='N',
                       help='read this many sentences into a buffer before processing them')
    group.add_argument('--fp16', action='store_true',
                       help='use fp16 precision')
    return group

def add_model_args(parser):
    group = parser.add_argument_group('Model configuration')

    # Model definitions can be found under fairseq/models/
    #
    # The model architecture can be specified in several ways.
    # In increasing order of priority:
    # 1) model defaults (lowest priority)
    # 2) --arch argument
    group.add_argument(
        '--arch', '-a', default='fconv', metavar='ARCH', required=True,
        choices=ARCH_MODEL_REGISTRY.keys(),
        help='model architecture: {} (default: fconv)'.format(
            ', '.join(ARCH_MODEL_REGISTRY.keys())),
    )

    # Criterion definitions can be found under fairseq/criterions/
    group.add_argument(
        '--criterion', default='cross_entropy', metavar='CRIT',
        choices=CRITERION_REGISTRY.keys(),
        help='training criterion: {} (default: cross_entropy)'.format(
            ', '.join(CRITERION_REGISTRY.keys())),
    )

    return group

def add_perf_args(parser):
    group = parser.add_argument_group('Performance')
    group.add_argument('--fuse-dropout-add', action='store_true',
                       help='Fuse dropout and residual adds.')
    group.add_argument('--fuse-relu-dropout', action='store_true',
                       help='Fuse Relu and Dropout.')
    group.add_argument('--fuse-layer-norm', action='store_true',
                       help='Use APEX\'s FusedLayerNorm instead of torch.nn.LayerNorm')
    return group
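
# A minimal end-to-end sketch of driving the parsers defined in this file
# programmatically via 'input_args'. The checkpoint path below is a
# placeholder, and this assumes the package is importable so that the
# registries referenced above are populated.
def _demo_build_inference_args():
    parser = get_inference_parser()
    args = parse_args_and_arch(parser, input_args=[
        '--path', 'results/checkpoint_best.pt',  # placeholder checkpoint path
        '--beam', '5',
        '--remove-bpe',                          # nargs='?' picks up the '@@ ' const defined above
    ])
    # parse_args_and_arch also derives composite fields, e.g.
    # args.max_positions == (args.max_source_positions, args.max_target_positions).
    return args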