transformers · commit 0f544625
fix swag example for work with apex
Authored Dec 13, 2018 by thomwolf · parent 0cf88ff0
Showing 3 changed files with 59 additions and 78 deletions:

- README.md (+2, -2)
- examples/run_swag.py (+56, -75)
- tests/optimization_test.py (+1, -1)
README.md (+2, -2)

The change touches the last two lines of the SWAG fine-tuning command (hunk `@@ -442,12 +442,12 @@ python run_swag.py \`); the flags shown in the hunk now read:

```shell
  --do_train \
  --do_lower_case \
  --do_eval \
  --data_dir $SWAG_DIR/data \
  --train_batch_size 16 \
  --learning_rate 2e-5 \
  --num_train_epochs 3.0 \
  --max_seq_length 80 \
  --output_dir /tmp/swag_output/ \
  --gradient_accumulation_steps 4
```
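For context on the flag being fixed here: with `--train_batch_size 16` and `--gradient_accumulation_steps 4`, each optimizer update corresponds to 16 examples processed as several smaller forward/backward passes, which keeps peak GPU memory low. A minimal, self-contained sketch of the accumulation pattern (the toy model and data below are placeholders, not the SWAG pipeline):

```python
import torch

# Toy stand-ins; run_swag.py does the same bookkeeping with BERT and real batches.
model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
gradient_accumulation_steps = 4

for step in range(16):
    micro_batch = torch.randn(4, 10)                 # small forward/backward pass
    loss = model(micro_batch).pow(2).mean()
    loss = loss / gradient_accumulation_steps        # keep the gradient scale unchanged
    loss.backward()                                  # .grad accumulates across calls
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()                             # one update per 4 micro-batches
        optimizer.zero_grad()
```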
examples/run_swag.py (+56, -75)

```diff
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
```
```diff
@@ -232,34 +233,10 @@ def select_field(features, field):
         for feature in features
     ]
 
-def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the parameters optimized on CPU/RAM back to the model on GPU
-    """
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        param_model.data.copy_(param_opti.data)
-
-def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
-    """
-    is_nan = False
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        if param_model.grad is not None:
-            if test_nan and torch.isnan(param_model.grad).sum() > 0:
-                is_nan = True
-            if param_opti.grad is None:
-                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
-            param_opti.grad.data.copy_(param_model.grad.data)
-        else:
-            param_opti.grad = None
-    return is_nan
+def warmup_linear(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return 1.0 - x
 
 def main():
     parser = argparse.ArgumentParser()
```
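The two removed helpers supported the old "optimize on CPU" fp16 path; with apex handling mixed precision, the only piece the script still needs is a learning-rate schedule, hence the new `warmup_linear`. It returns a multiplier that ramps linearly from 0 to 1 over the first `warmup` fraction of training and then decays linearly toward 0. A quick sketch of how the training loop later in this diff applies it; the numbers are illustrative only:

```python
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training done so far (global_step / t_total)
    if x < warmup:
        return x / warmup      # linear warmup
    return 1.0 - x             # then linear decay

learning_rate, t_total, warmup_proportion = 2e-5, 1000, 0.1
for global_step in (0, 50, 100, 500, 900):
    lr_this_step = learning_rate * warmup_linear(global_step / t_total, warmup_proportion)
    print(f"step {global_step}: lr = {lr_this_step:.2e}")
```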
```diff
@@ -335,17 +312,15 @@ def main():
                         type=int,
                         default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--optimize_on_cpu',
-                        default=False,
-                        action='store_true',
-                        help="Whether to perform optimization and keep the optimizer averages on CPU")
     parser.add_argument('--fp16',
                         default=False,
                         action='store_true',
                         help="Whether to use 16-bit float precision instead of 32-bit")
     parser.add_argument('--loss_scale',
-                        type=float, default=128,
-                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
 
     args = parser.parse_args()
```
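The `--optimize_on_cpu` option is gone, and `--loss_scale` now defaults to 0, which the new code interprets as "use apex's dynamic loss scaling" (a positive power of 2 still selects a static scale). Loss scaling exists because many gradients in half precision are small enough to underflow to zero; multiplying the loss before the backward pass shifts them into fp16's representable range, and the optimizer divides the scale back out before updating weights. A tiny illustration of the underflow itself (not code from run_swag.py):

```python
import torch

# fp16 cannot represent magnitudes much below ~6e-8, so tiny gradients vanish.
print(torch.tensor(1e-8).half())          # tensor(0., dtype=torch.float16): underflow
# Scaling by a loss-scale factor (here 128) keeps the value representable;
# the fp16 optimizer unscales gradients again before the weight update.
print((torch.tensor(1e-8) * 128).half())  # a small but nonzero fp16 value
```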
```diff
@@ -353,13 +328,11 @@ def main():
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
     else:
+        torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-        if args.fp16:
-            logger.info("16-bits training currently not supported in distributed training")
-            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
 
     if args.gradient_accumulation_steps < 1:
```
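The block that force-disabled fp16 under distributed training is removed; distributed fp16 now goes through apex's DistributedDataParallel, introduced in the next hunk. As a reminder of what the surrounding branch does: `--local_rank` is -1 for ordinary single-process runs, while a distributed launcher passes a real rank, in which case each process pins itself to one GPU and joins the NCCL process group. A standalone sketch of that selection logic, with the process-group call commented out so it runs outside a launcher (the two arguments shown are only the relevant subset of the script's argument list):

```python
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)   # set by the distributed launcher
parser.add_argument('--no_cuda', action='store_true')
args = parser.parse_args([])                                # defaults: single-process run

if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    torch.cuda.set_device(args.local_rank)                  # one GPU per process
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    # torch.distributed.init_process_group(backend='nccl')  # joins the NCCL group
print(device, n_gpu)
```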
@@ -399,28 +372,46 @@ def main():
...
@@ -399,28 +372,46 @@ def main():
model
.
half
()
model
.
half
()
model
.
to
(
device
)
model
.
to
(
device
)
if
args
.
local_rank
!=
-
1
:
if
args
.
local_rank
!=
-
1
:
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
,
device_ids
=
[
args
.
local_rank
],
try
:
output_device
=
args
.
local_rank
)
from
apex.parallel
import
DistributedDataParallel
as
DDP
except
ImportError
:
raise
ImportError
(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
)
model
=
DDP
(
model
)
elif
n_gpu
>
1
:
elif
n_gpu
>
1
:
model
=
torch
.
nn
.
DataParallel
(
model
)
model
=
torch
.
nn
.
DataParallel
(
model
)
# Prepare optimizer
# Prepare optimizer
if
args
.
fp16
:
param_optimizer
=
[(
n
,
param
.
clone
().
detach
().
to
(
'cpu'
).
float
().
requires_grad_
())
\
for
n
,
param
in
model
.
named_parameters
()]
elif
args
.
optimize_on_cpu
:
param_optimizer
=
[(
n
,
param
.
clone
().
detach
().
to
(
'cpu'
).
requires_grad_
())
\
for
n
,
param
in
model
.
named_parameters
()]
else
:
param_optimizer
=
list
(
model
.
named_parameters
())
param_optimizer
=
list
(
model
.
named_parameters
())
no_decay
=
[
'bias'
,
'gamma'
,
'beta'
]
# hack to remove pooler, which is not used
# thus it produce None grad that break apex
param_optimizer
=
[
n
for
n
in
param_optimizer
if
'pooler'
not
in
n
[
0
]]
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
optimizer_grouped_parameters
=
[
optimizer_grouped_parameters
=
[
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay
_rate
'
:
0.01
},
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.01
},
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay
_rate
'
:
0.0
}
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.0
}
]
]
t_total
=
num_train_steps
t_total
=
num_train_steps
if
args
.
local_rank
!=
-
1
:
if
args
.
local_rank
!=
-
1
:
t_total
=
t_total
//
torch
.
distributed
.
get_world_size
()
t_total
=
t_total
//
torch
.
distributed
.
get_world_size
()
if
args
.
fp16
:
try
:
from
apex.optimizers
import
FP16_Optimizer
from
apex.optimizers
import
FusedAdam
except
ImportError
:
raise
ImportError
(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
)
optimizer
=
FusedAdam
(
optimizer_grouped_parameters
,
lr
=
args
.
learning_rate
,
bias_correction
=
False
,
max_grad_norm
=
1.0
)
if
args
.
loss_scale
==
0
:
optimizer
=
FP16_Optimizer
(
optimizer
,
dynamic_loss_scale
=
True
)
else
:
optimizer
=
FP16_Optimizer
(
optimizer
,
static_loss_scale
=
args
.
loss_scale
)
else
:
optimizer
=
BertAdam
(
optimizer_grouped_parameters
,
optimizer
=
BertAdam
(
optimizer_grouped_parameters
,
lr
=
args
.
learning_rate
,
lr
=
args
.
learning_rate
,
warmup
=
args
.
warmup_proportion
,
warmup
=
args
.
warmup_proportion
,
...
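Two things to note in the optimizer setup. First, the parameter groups are now keyed on the real parameter names (`LayerNorm.bias`, `LayerNorm.weight`) rather than the old `gamma`/`beta` aliases, and the group option is `weight_decay` instead of `weight_decay_rate`. Second, the unused pooler is filtered out so apex never sees parameters whose gradient stays `None`. A standalone sketch of the same grouping logic on a toy module (the module is a stand-in, not BertForMultipleChoice):

```python
import torch

# Toy stand-in with a LayerNorm, an unused "pooler", and a classifier head.
model = torch.nn.Sequential()
model.add_module('LayerNorm', torch.nn.LayerNorm(4))
model.add_module('pooler', torch.nn.Linear(4, 4))
model.add_module('classifier', torch.nn.Linear(4, 2))

param_optimizer = list(model.named_parameters())
# Drop the pooler: it gets no gradient, and None grads break apex's FusedAdam.
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
for group in optimizer_grouped_parameters:
    print(len(group['params']), 'params with weight_decay =', group['weight_decay'])
```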
```diff
@@ -461,28 +452,18 @@ def main():
                     loss = loss * args.loss_scale
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
-                loss.backward()
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16 or args.optimize_on_cpu:
-                        if args.fp16 and args.loss_scale != 1.0:
-                            # scale down gradients for fp16 training
-                            for param in model.parameters():
-                                if param.grad is not None:
-                                    param.grad.data = param.grad.data / args.loss_scale
-                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
-                        if is_nan:
-                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
-                            args.loss_scale = args.loss_scale / 2
-                            model.zero_grad()
-                            continue
-                        optimizer.step()
-                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
-                    else:
-                        optimizer.step()
-                    model.zero_grad()
+                    # modify learning rate with special warm up BERT uses
+                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
+                    for param_group in optimizer.param_groups:
+                        param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
                     global_step += 1
 
         if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
```
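The hand-rolled fp16 path (scaling gradients down, checking for NaNs, halving the loss scale and retrying) disappears: with `FP16_Optimizer`, `optimizer.backward(loss)` applies the loss scale before backpropagation and `optimizer.step()` unscales the gradients, handling overflow internally when dynamic scaling is enabled. The learning rate is now set explicitly on each update from `warmup_linear`, and the zero_grad call moves from the model to the optimizer. A condensed, pure-PyTorch mock of the new per-step control flow (toy model and data; the apex branch is left as a comment):

```python
import torch

def warmup_linear(x, warmup=0.002):
    return x / warmup if x < warmup else 1.0 - x

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0)   # lr overwritten every update
fp16 = False
gradient_accumulation_steps, learning_rate = 2, 2e-5
t_total, warmup_proportion, global_step = 20, 0.1, 0

for step in range(40):
    loss = model(torch.randn(4, 8)).pow(2).mean()
    if gradient_accumulation_steps > 1:
        loss = loss / gradient_accumulation_steps

    if fp16:
        pass   # optimizer.backward(loss): FP16_Optimizer scales the loss, then backprops
    else:
        loss.backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        # modify learning rate with the warmup schedule, as in the diff
        lr_this_step = learning_rate * warmup_linear(global_step / t_total, warmup_proportion)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr_this_step
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
```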
tests/optimization_test.py (+1, -1)

```diff
@@ -35,7 +35,7 @@ class OptimizationTest(unittest.TestCase):
         criterion = torch.nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
         optimizer = BertAdam(params=[w], lr=2e-1,
-                             weight_decay_rate=0.0,
+                             weight_decay=0.0,
                              max_grad_norm=-1)
         for _ in range(100):
             loss = criterion(w, target)
```
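The one-line test change tracks a rename in `BertAdam`: the weight-decay keyword is now `weight_decay` rather than `weight_decay_rate`. A minimal usage check mirroring the updated test (this assumes the library is installed under its name at the time, `pytorch_pretrained_bert`):

```python
import torch
from pytorch_pretrained_bert import BertAdam   # package name at the time of this commit

w = torch.nn.Parameter(torch.randn(3))
target = torch.zeros(3)
criterion = torch.nn.MSELoss()
# No warmup, constant schedule, no gradient clipping; note `weight_decay`, not `weight_decay_rate`.
optimizer = BertAdam(params=[w], lr=2e-1, weight_decay=0.0, max_grad_norm=-1)
for _ in range(100):
    loss = criterion(w, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print(w)   # should end up close to the zero target, as the original test asserts
```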