Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
0f544625
Commit
0f544625
authored
Dec 13, 2018
by
thomwolf
Browse files
fix swag example for work with apex
parent
0cf88ff0
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
59 additions
and
78 deletions
+59
-78
README.md
README.md
+2
-2
examples/run_swag.py
examples/run_swag.py
+56
-75
tests/optimization_test.py
tests/optimization_test.py
+1
-1
No files found.
README.md
View file @
0f544625
...
...
@@ -442,12 +442,12 @@ python run_swag.py \
--do_train
\
--do_lower_case
\
--do_eval
\
--data_dir
$SWAG_DIR
/data
--data_dir
$SWAG_DIR
/data
\
--train_batch_size
16
\
--learning_rate
2e-5
\
--num_train_epochs
3.0
\
--max_seq_length
80
\
--output_dir
/tmp/swag_output/
--output_dir
/tmp/swag_output/
\
--gradient_accumulation_steps
4
```
...
...
examples/run_swag.py
View file @
0f544625
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -232,34 +233,10 @@ def select_field(features, field):
for
feature
in
features
]
def
copy_optimizer_params_to_model
(
named_params_model
,
named_params_optimizer
):
""" Utility function for optimize_on_cpu and 16-bits training.
Copy the parameters optimized on CPU/RAM back to the model on GPU
"""
for
(
name_opti
,
param_opti
),
(
name_model
,
param_model
)
in
zip
(
named_params_optimizer
,
named_params_model
):
if
name_opti
!=
name_model
:
logger
.
error
(
"name_opti != name_model: {} {}"
.
format
(
name_opti
,
name_model
))
raise
ValueError
param_model
.
data
.
copy_
(
param_opti
.
data
)
def
set_optimizer_params_grad
(
named_params_optimizer
,
named_params_model
,
test_nan
=
False
):
""" Utility function for optimize_on_cpu and 16-bits training.
Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
"""
is_nan
=
False
for
(
name_opti
,
param_opti
),
(
name_model
,
param_model
)
in
zip
(
named_params_optimizer
,
named_params_model
):
if
name_opti
!=
name_model
:
logger
.
error
(
"name_opti != name_model: {} {}"
.
format
(
name_opti
,
name_model
))
raise
ValueError
if
param_model
.
grad
is
not
None
:
if
test_nan
and
torch
.
isnan
(
param_model
.
grad
).
sum
()
>
0
:
is_nan
=
True
if
param_opti
.
grad
is
None
:
param_opti
.
grad
=
torch
.
nn
.
Parameter
(
param_opti
.
data
.
new
().
resize_
(
*
param_opti
.
data
.
size
()))
param_opti
.
grad
.
data
.
copy_
(
param_model
.
grad
.
data
)
else
:
param_opti
.
grad
=
None
return
is_nan
def
warmup_linear
(
x
,
warmup
=
0.002
):
if
x
<
warmup
:
return
x
/
warmup
return
1.0
-
x
def
main
():
parser
=
argparse
.
ArgumentParser
()
...
...
@@ -335,17 +312,15 @@ def main():
type
=
int
,
default
=
1
,
help
=
"Number of updates steps to accumulate before performing a backward/update pass."
)
parser
.
add_argument
(
'--optimize_on_cpu'
,
default
=
False
,
action
=
'store_true'
,
help
=
"Whether to perform optimization and keep the optimizer averages on CPU"
)
parser
.
add_argument
(
'--fp16'
,
default
=
False
,
action
=
'store_true'
,
help
=
"Whether to use 16-bit float precision instead of 32-bit"
)
parser
.
add_argument
(
'--loss_scale'
,
type
=
float
,
default
=
128
,
help
=
'Loss scaling, positive power of 2 values can improve fp16 convergence.'
)
type
=
float
,
default
=
0
,
help
=
"Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.
\n
"
"0 (default value): dynamic loss scaling.
\n
"
"Positive power of 2: static loss scaling value.
\n
"
)
args
=
parser
.
parse_args
()
...
...
@@ -353,13 +328,11 @@ def main():
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
and
not
args
.
no_cuda
else
"cpu"
)
n_gpu
=
torch
.
cuda
.
device_count
()
else
:
torch
.
cuda
.
set_device
(
args
.
local_rank
)
device
=
torch
.
device
(
"cuda"
,
args
.
local_rank
)
n_gpu
=
1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch
.
distributed
.
init_process_group
(
backend
=
'nccl'
)
if
args
.
fp16
:
logger
.
info
(
"16-bits training currently not supported in distributed training"
)
args
.
fp16
=
False
# (see https://github.com/pytorch/pytorch/pull/13496)
logger
.
info
(
"device %s n_gpu %d distributed training %r"
,
device
,
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
))
if
args
.
gradient_accumulation_steps
<
1
:
...
...
@@ -399,28 +372,46 @@ def main():
model
.
half
()
model
.
to
(
device
)
if
args
.
local_rank
!=
-
1
:
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
,
device_ids
=
[
args
.
local_rank
],
output_device
=
args
.
local_rank
)
try
:
from
apex.parallel
import
DistributedDataParallel
as
DDP
except
ImportError
:
raise
ImportError
(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
)
model
=
DDP
(
model
)
elif
n_gpu
>
1
:
model
=
torch
.
nn
.
DataParallel
(
model
)
# Prepare optimizer
if
args
.
fp16
:
param_optimizer
=
[(
n
,
param
.
clone
().
detach
().
to
(
'cpu'
).
float
().
requires_grad_
())
\
for
n
,
param
in
model
.
named_parameters
()]
elif
args
.
optimize_on_cpu
:
param_optimizer
=
[(
n
,
param
.
clone
().
detach
().
to
(
'cpu'
).
requires_grad_
())
\
for
n
,
param
in
model
.
named_parameters
()]
else
:
param_optimizer
=
list
(
model
.
named_parameters
())
no_decay
=
[
'bias'
,
'gamma'
,
'beta'
]
# hack to remove pooler, which is not used
# thus it produce None grad that break apex
param_optimizer
=
[
n
for
n
in
param_optimizer
if
'pooler'
not
in
n
[
0
]]
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
optimizer_grouped_parameters
=
[
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay
_rate
'
:
0.01
},
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay
_rate
'
:
0.0
}
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.01
},
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.0
}
]
t_total
=
num_train_steps
if
args
.
local_rank
!=
-
1
:
t_total
=
t_total
//
torch
.
distributed
.
get_world_size
()
if
args
.
fp16
:
try
:
from
apex.optimizers
import
FP16_Optimizer
from
apex.optimizers
import
FusedAdam
except
ImportError
:
raise
ImportError
(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
)
optimizer
=
FusedAdam
(
optimizer_grouped_parameters
,
lr
=
args
.
learning_rate
,
bias_correction
=
False
,
max_grad_norm
=
1.0
)
if
args
.
loss_scale
==
0
:
optimizer
=
FP16_Optimizer
(
optimizer
,
dynamic_loss_scale
=
True
)
else
:
optimizer
=
FP16_Optimizer
(
optimizer
,
static_loss_scale
=
args
.
loss_scale
)
else
:
optimizer
=
BertAdam
(
optimizer_grouped_parameters
,
lr
=
args
.
learning_rate
,
warmup
=
args
.
warmup_proportion
,
...
...
@@ -461,28 +452,18 @@ def main():
loss
=
loss
*
args
.
loss_scale
if
args
.
gradient_accumulation_steps
>
1
:
loss
=
loss
/
args
.
gradient_accumulation_steps
if
args
.
fp16
:
optimizer
.
backward
(
loss
)
else
:
loss
.
backward
()
tr_loss
+=
loss
.
item
()
nb_tr_examples
+=
input_ids
.
size
(
0
)
nb_tr_steps
+=
1
if
(
step
+
1
)
%
args
.
gradient_accumulation_steps
==
0
:
if
args
.
fp16
or
args
.
optimize_on_cpu
:
if
args
.
fp16
and
args
.
loss_scale
!=
1.0
:
# scale down gradients for fp16 training
for
param
in
model
.
parameters
():
if
param
.
grad
is
not
None
:
param
.
grad
.
data
=
param
.
grad
.
data
/
args
.
loss_scale
is_nan
=
set_optimizer_params_grad
(
param_optimizer
,
model
.
named_parameters
(),
test_nan
=
True
)
if
is_nan
:
logger
.
info
(
"FP16 TRAINING: Nan in gradients, reducing loss scaling"
)
args
.
loss_scale
=
args
.
loss_scale
/
2
model
.
zero_grad
()
continue
optimizer
.
step
()
copy_optimizer_params_to_model
(
model
.
named_parameters
(),
param_optimizer
)
else
:
# modify learning rate with special warm up BERT uses
lr_this_step
=
args
.
learning_rate
*
warmup_linear
(
global_step
/
t_total
,
args
.
warmup_proportion
)
for
param_group
in
optimizer
.
param_groups
:
param_group
[
'lr'
]
=
lr_this_step
optimizer
.
step
()
model
.
zero_grad
()
optimizer
.
zero_grad
()
global_step
+=
1
if
args
.
do_eval
and
(
args
.
local_rank
==
-
1
or
torch
.
distributed
.
get_rank
()
==
0
):
...
...
tests/optimization_test.py
View file @
0f544625
...
...
@@ -35,7 +35,7 @@ class OptimizationTest(unittest.TestCase):
criterion
=
torch
.
nn
.
MSELoss
()
# No warmup, constant schedule, no gradient clipping
optimizer
=
BertAdam
(
params
=
[
w
],
lr
=
2e-1
,
weight_decay
_rate
=
0.0
,
weight_decay
=
0.0
,
max_grad_norm
=-
1
)
for
_
in
range
(
100
):
loss
=
criterion
(
w
,
target
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment