Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
ccb6947d
Commit
ccb6947d
authored
Jul 11, 2019
by
thomwolf
Browse files
optimization tests
parent
e4f9dca0
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
93 additions
and
51 deletions
+93
-51
examples/run_glue.py
examples/run_glue.py
+18
-11
examples/test_examples.py
examples/test_examples.py
+12
-3
pytorch_transformers/optimization.py
pytorch_transformers/optimization.py
+12
-11
pytorch_transformers/tests/optimization_test.py
pytorch_transformers/tests/optimization_test.py
+51
-26
No files found.
examples/run_glue.py
View file @
ccb6947d
...
...
@@ -96,8 +96,10 @@ def train(args, train_dataset, model, tokenizer):
global_step
=
0
tr_loss
,
logging_loss
=
0.0
,
0.0
model
.
zero_grad
()
for
_
in
trange
(
int
(
args
.
num_train_epochs
),
desc
=
"Epoch"
,
disable
=
args
.
local_rank
not
in
[
-
1
,
0
]):
for
step
,
batch
in
enumerate
(
tqdm
(
train_dataloader
,
desc
=
"Iteration"
,
disable
=
args
.
local_rank
not
in
[
-
1
,
0
])):
train_iterator
=
trange
(
int
(
args
.
num_train_epochs
),
desc
=
"Epoch"
,
disable
=
args
.
local_rank
not
in
[
-
1
,
0
])
for
_
in
train_iterator
:
epoch_iterator
=
tqdm
(
train_dataloader
,
desc
=
"Iteration"
,
disable
=
args
.
local_rank
not
in
[
-
1
,
0
])
for
step
,
batch
in
enumerate
(
epoch_iterator
):
model
.
train
()
batch
=
tuple
(
t
.
to
(
args
.
device
)
for
t
in
batch
)
inputs
=
{
'input_ids'
:
batch
[
0
],
...
...
@@ -129,7 +131,7 @@ def train(args, train_dataset, model, tokenizer):
if
args
.
local_rank
in
[
-
1
,
0
]
and
args
.
logging_steps
>
0
and
global_step
%
args
.
logging_steps
==
0
:
# Log metrics
if
args
.
local_rank
==
-
1
:
# Only evaluate when single GPU otherwise metrics may not average well
if
args
.
local_rank
==
-
1
and
args
.
evaluate_during_training
:
# Only evaluate when single GPU otherwise metrics may not average well
results
=
evaluate
(
args
,
model
,
tokenizer
)
for
key
,
value
in
results
.
items
():
tb_writer
.
add_scalar
(
'eval_{}'
.
format
(
key
),
value
,
global_step
)
...
...
@@ -148,8 +150,10 @@ def train(args, train_dataset, model, tokenizer):
logger
.
info
(
"Saving model checkpoint to %s"
,
output_dir
)
if
args
.
max_steps
>
0
and
global_step
>
args
.
max_steps
:
epoch_iterator
.
close
()
break
if
args
.
max_steps
>
0
and
global_step
>
args
.
max_steps
:
train_iterator
.
close
()
break
return
global_step
,
tr_loss
/
global_step
...
...
@@ -164,11 +168,10 @@ def evaluate(args, model, tokenizer, prefix=""):
for
eval_task
,
eval_output_dir
in
zip
(
eval_task_names
,
eval_outputs_dirs
):
eval_dataset
=
load_and_cache_examples
(
args
,
eval_task
,
tokenizer
,
evaluate
=
True
)
""" Evaluate the model """
if
not
os
.
path
.
exists
(
eval_output_dir
)
and
args
.
local_rank
in
[
-
1
,
0
]:
os
.
makedirs
(
eval_output_dir
)
args
.
eval_batch_size
=
args
.
per_gpu_eval_batch_size
*
args
.
n_gpu
args
.
eval_batch_size
=
args
.
per_gpu_eval_batch_size
*
max
(
1
,
args
.
n_gpu
)
# Note that DistributedSampler samples randomly
eval_sampler
=
SequentialSampler
(
eval_dataset
)
if
args
.
local_rank
==
-
1
else
DistributedSampler
(
eval_dataset
)
eval_dataloader
=
DataLoader
(
eval_dataset
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
...
...
@@ -177,7 +180,7 @@ def evaluate(args, model, tokenizer, prefix=""):
logger
.
info
(
"***** Running evaluation {} *****"
.
format
(
prefix
))
logger
.
info
(
" Num examples = %d"
,
len
(
eval_dataset
))
logger
.
info
(
" Batch size = %d"
,
args
.
eval_batch_size
)
eval_loss
=
0
eval_loss
=
0
.0
nb_eval_steps
=
0
preds
=
None
out_label_ids
=
None
...
...
@@ -287,6 +290,8 @@ def main():
help
=
"Whether to run training."
)
parser
.
add_argument
(
"--do_eval"
,
action
=
'store_true'
,
help
=
"Whether to run eval on the dev set."
)
parser
.
add_argument
(
"--evaluate_during_training"
,
action
=
'store_true'
,
help
=
"Run evaluation during training at each logging step."
)
parser
.
add_argument
(
"--do_lower_case"
,
action
=
'store_true'
,
help
=
"Set this flag if you are using an uncased model."
)
...
...
@@ -364,7 +369,7 @@ def main():
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
level
=
logging
.
INFO
if
args
.
local_rank
in
[
-
1
,
0
]
else
logging
.
WARN
)
logger
.
warning
(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s"
,
args
.
local_rank
,
device
,
args
.
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
),
args
.
fp16
)
args
.
local_rank
,
device
,
args
.
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
),
args
.
fp16
)
# Setup seeds
random
.
seed
(
args
.
seed
)
...
...
@@ -409,6 +414,8 @@ def main():
elif
args
.
n_gpu
>
1
:
model
=
torch
.
nn
.
DataParallel
(
model
)
logger
.
info
(
"Training/evaluation parameters %s"
,
args
)
# Training
if
args
.
do_train
:
train_dataset
=
load_and_cache_examples
(
args
,
args
.
task_name
,
tokenizer
,
evaluate
=
False
)
...
...
@@ -438,22 +445,22 @@ def main():
model
.
to
(
args
.
device
)
# Evaluation
results
=
{}
if
args
.
do_eval
and
args
.
local_rank
in
[
-
1
,
0
]:
checkpoints
=
[
args
.
output_dir
+
'./'
+
WEIGHTS_NAME
]
checkpoints
=
[
args
.
output_dir
]
if
args
.
eval_all_checkpoints
:
checkpoints
=
list
(
os
.
path
.
dirname
(
c
)
for
c
in
sorted
(
glob
.
glob
(
args
.
output_dir
+
'/**/'
+
WEIGHTS_NAME
,
recursive
=
True
)))
logging
.
getLogger
(
"pytorch_transformers.modeling_utils"
).
setLevel
(
logging
.
WARN
)
# Reduce logging
logger
.
info
(
"Evaluate the following checkpoints: %s"
,
checkpoints
)
results
=
{}
for
checkpoint
in
checkpoints
:
global_step
=
int
(
checkpoint
.
split
(
'-'
)[
-
1
]
)
global_step
=
checkpoint
.
split
(
'-'
)[
-
1
]
model
=
model_class
.
from_pretrained
(
checkpoint
)
model
.
to
(
args
.
device
)
result
=
evaluate
(
args
,
model
,
tokenizer
,
prefix
=
global_step
)
result
=
dict
((
k
+
'_{}'
.
format
(
global_step
),
v
)
for
k
,
v
in
result
.
items
())
results
.
update
(
result
)
return
results
return
results
if
__name__
==
"__main__"
:
...
...
examples/test_examples.py
View file @
ccb6947d
...
...
@@ -45,9 +45,18 @@ class ExamplesTests(unittest.TestCase):
stream_handler
=
logging
.
StreamHandler
(
sys
.
stdout
)
logger
.
addHandler
(
stream_handler
)
testargs
=
[
"run_glue.py"
,
"--data_dir=./examples/tests_samples/MRPC/"
,
"--task_name=mrpc"
,
"--do_train"
,
"--do_eval"
,
"--output_dir=./examples/tests_samples/temp_dir"
,
"--train_batch_size=4"
,
"--eval_batch_size=2"
,
"--num_train_epochs=2.0"
,
"--overwrite_output_dir"
]
testargs
=
[
"run_glue.py"
,
"--data_dir=./examples/tests_samples/MRPC/"
,
"--task_name=mrpc"
,
"--do_train"
,
"--do_eval"
,
"--output_dir=./examples/tests_samples/temp_dir"
,
"--per_gpu_train_batch_size=2"
,
"--per_gpu_eval_batch_size=1"
,
"--learning_rate=1e-4"
,
"--max_steps=10"
,
"--warmup_steps=2"
,
"--overwrite_output_dir"
]
model_name
=
"--model_name=bert-base-uncased"
with
patch
.
object
(
sys
,
'argv'
,
testargs
+
[
model_name
]):
result
=
run_glue
.
main
()
...
...
pytorch_transformers/optimization.py
View file @
ccb6947d
...
...
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
class ConstantLRSchedule(LambdaLR):
    """Schedule that leaves the optimizer's learning rate unchanged.

    Args:
        optimizer: the optimizer whose learning rate is scheduled.
        last_epoch: index of the last step already performed; forwarded
            unchanged to ``LambdaLR`` (``-1`` means "start from scratch").
    """

    def __init__(self, optimizer, last_epoch=-1):
        # The lambda is the multiplicative factor handed to LambdaLR. It must
        # be the constant 1.0: an identity lambda (``lambda x: x``) would
        # scale the rate by the step index instead of keeping it constant.
        # The parent initializer is called exactly once.
        super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
class
WarmupCosineSchedule
(
LambdaLR
):
"""
...
...
@@ -42,10 +42,10 @@ class WarmupCosineSchedule(LambdaLR):
def lr_lambda(step):
    """Return the learning-rate multiplier for ``step`` (warmup, then cosine).

    ``warmup_steps``, ``t_total`` and ``cycles`` are free variables that are
    expected to be provided by the enclosing scope.

    For ``step < warmup_steps`` the factor is ``step / max(1.0, warmup_steps)``;
    afterwards it is ``0.5 * (1 + cos(pi * cycles * 2 * progress))``.
    """
    if step < warmup_steps:
        # Explicit float() casts keep this a true division even where `/`
        # on two ints would truncate.
        return float(step) / float(max(1.0, warmup_steps))
    # Fraction of the post-warmup span that has elapsed.
    progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))
    return 0.5 * (1. + math.cos(math.pi * float(cycles) * 2.0 * progress))
super
(
WarmupCosineSchedule
,
self
).
__init__
(
optimizer
,
lr_lambda
,
last_epoch
=
last_epoch
)
...
...
@@ -59,11 +59,12 @@ class WarmupCosineWithHardRestartsSchedule(LambdaLR):
def lr_lambda(step):
    """Return the learning-rate multiplier for ``step`` (warmup, cosine, hard restarts).

    ``warmup_steps``, ``t_total`` and ``cycles`` are free variables that are
    expected to be provided by the enclosing scope.

    For ``step < warmup_steps`` the factor is ``step / max(1, warmup_steps)``.
    Afterwards the cosine argument is ``(cycles * progress) % 1.0``, so the
    curve jumps back to its starting value each time that product crosses an
    integer. Once ``progress`` reaches 1.0 the factor is 0.0.
    """
    if step < warmup_steps:
        # Explicit float() casts keep this a true division even where `/`
        # on two ints would truncate.
        return float(step) / float(max(1, warmup_steps))
    # Fraction of the post-warmup span that has elapsed.
    progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))
    if progress >= 1.0:
        # Without this cutoff the modulo below would wrap to 0 and yield a
        # factor of 1.0 at the very end of the schedule.
        return 0.0
    return 0.5 * (1. + math.cos(math.pi * ((float(cycles) * progress) % 1.0)))
super
(
WarmupCosineWithHardRestartsSchedule
,
self
).
__init__
(
optimizer
,
lr_lambda
,
last_epoch
=
last_epoch
)
...
...
@@ -77,7 +78,7 @@ class WarmupConstantSchedule(LambdaLR):
def lr_lambda(step):
    """Return the learning-rate multiplier for ``step`` (linear warmup, then constant).

    ``warmup_steps`` is a free variable that is expected to be provided by the
    enclosing scope. The factor is ``step / max(1.0, warmup_steps)`` while
    ``step < warmup_steps`` and 1.0 afterwards.
    """
    if step < warmup_steps:
        # float() casts force true division; max(1.0, ...) keeps the
        # denominator non-zero.
        return float(step) / float(max(1.0, warmup_steps))
    return 1.
super
(
WarmupConstantSchedule
,
self
).
__init__
(
optimizer
,
lr_lambda
,
last_epoch
=
last_epoch
)
...
...
@@ -92,8 +93,8 @@ class WarmupLinearSchedule(LambdaLR):
def lr_lambda(step):
    """Return the learning-rate multiplier for ``step`` (linear warmup, linear decay).

    ``warmup_steps`` and ``t_total`` are free variables that are expected to
    be provided by the enclosing scope. The factor is
    ``step / max(1, warmup_steps)`` while ``step < warmup_steps``, then
    ``(t_total - step) / max(1.0, t_total - warmup_steps)``, never going
    below 0.0.
    """
    if step < warmup_steps:
        # float() casts force true division regardless of operand types.
        return float(step) / float(max(1, warmup_steps))
    decay = float(t_total - step) / float(max(1.0, t_total - warmup_steps))
    # Clamp so that stepping past t_total yields 0.0 rather than a negative
    # multiplier.
    return max(0.0, decay)
super
(
WarmupLinearSchedule
,
self
).
__init__
(
optimizer
,
lr_lambda
,
last_epoch
=
last_epoch
)
...
...
pytorch_transformers/tests/optimization_test.py
View file @
ccb6947d
...
...
@@ -26,6 +26,13 @@ from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSched
import
numpy
as
np
def unwrap_schedule(scheduler, num_steps=10):
    """Step ``scheduler`` ``num_steps`` times and collect ``get_lr()`` after each step.

    Returns a list with one entry per step, each entry being whatever
    ``scheduler.get_lr()`` returned right after the corresponding ``step()``.
    """
    def advance_and_read():
        # Order matters: advance first, then read the resulting rate(s).
        scheduler.step()
        return scheduler.get_lr()

    return [advance_and_read() for _ in range(num_steps)]
class
OptimizationTest
(
unittest
.
TestCase
):
def
assertListAlmostEqual
(
self
,
list1
,
list2
,
tol
):
...
...
@@ -38,9 +45,7 @@ class OptimizationTest(unittest.TestCase):
target
=
torch
.
tensor
([
0.4
,
0.2
,
-
0.5
])
criterion
=
torch
.
nn
.
MSELoss
()
# No warmup, constant schedule, no gradient clipping
optimizer
=
AdamW
(
params
=
[
w
],
lr
=
2e-1
,
weight_decay
=
0.0
,
max_grad_norm
=-
1
)
optimizer
=
AdamW
(
params
=
[
w
],
lr
=
2e-1
,
weight_decay
=
0.0
)
for
_
in
range
(
100
):
loss
=
criterion
(
w
,
target
)
loss
.
backward
()
...
...
@@ -51,29 +56,49 @@ class OptimizationTest(unittest.TestCase):
class
ScheduleInitTest
(
unittest
.
TestCase
):
def
test_sched_init
(
self
):
m
=
torch
.
nn
.
Linear
(
50
,
50
)
optim
=
AdamW
(
m
.
parameters
(),
lr
=
0.001
,
warmup
=
.
1
,
t_total
=
1000
,
schedule
=
None
)
self
.
assertTrue
(
isinstance
(
optim
.
param_groups
[
0
][
"schedule"
],
ConstantLR
))
optim
=
AdamW
(
m
.
parameters
(),
lr
=
0.001
,
warmup
=
.
1
,
t_total
=
1000
,
schedule
=
"none"
)
self
.
assertTrue
(
isinstance
(
optim
.
param_groups
[
0
][
"schedule"
],
ConstantLR
))
optim
=
AdamW
(
m
.
parameters
(),
lr
=
0.001
,
warmup
=
.
01
,
t_total
=
1000
)
self
.
assertTrue
(
isinstance
(
optim
.
param_groups
[
0
][
"schedule"
],
WarmupLinearSchedule
))
# shouldn't fail
class
WarmupCosineWithRestartsTest
(
unittest
.
TestCase
):
def
test_it
(
self
):
m
=
WarmupCosineWithWarmupRestartsSchedule
(
warmup
=
0.05
,
t_total
=
1000.
,
cycles
=
5
)
x
=
np
.
arange
(
0
,
1000
)
y
=
[
m
.
get_lr
(
xe
)
for
xe
in
x
]
y
=
np
.
asarray
(
y
)
expected_zeros
=
y
[[
0
,
200
,
400
,
600
,
800
]]
print
(
expected_zeros
)
expected_ones
=
y
[[
50
,
250
,
450
,
650
,
850
]]
print
(
expected_ones
)
self
.
assertTrue
(
np
.
allclose
(
expected_ones
,
1
))
self
.
assertTrue
(
np
.
allclose
(
expected_zeros
,
0
))
m
=
torch
.
nn
.
Linear
(
50
,
50
)
optimizer
=
AdamW
(
m
.
parameters
(),
lr
=
10.
)
num_steps
=
10
def assertListAlmostEqual(self, list1, list2, tol):
    """Assert both lists have equal length and agree element-wise within ``tol``.

    ``tol`` is passed as the ``delta`` argument of ``assertAlmostEqual``.
    """
    size_first, size_second = len(list1), len(list2)
    self.assertEqual(size_first, size_second)
    for actual, wanted in zip(list1, list2):
        self.assertAlmostEqual(actual, wanted, delta=tol)
def test_constant_scheduler(self):
    """ConstantLRSchedule should report 10.0 at each of the ``num_steps`` steps."""
    schedule = ConstantLRSchedule(self.optimizer)
    per_step_lrs = unwrap_schedule(schedule, self.num_steps)
    wanted = [10.] * self.num_steps
    # Each step is expected to report exactly one learning rate.
    self.assertEqual(len(per_step_lrs[0]), 1)
    observed = [step_lrs[0] for step_lrs in per_step_lrs]
    self.assertListEqual(observed, wanted)
def test_warmup_constant_scheduler(self):
    """WarmupConstantSchedule with 4 warmup steps: ramp to 10.0, then stay there."""
    schedule = WarmupConstantSchedule(self.optimizer, warmup_steps=4)
    per_step_lrs = unwrap_schedule(schedule, self.num_steps)
    ramp = [2.5, 5.0, 7.5]
    plateau = [10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]
    wanted = ramp + plateau
    # Each step is expected to report exactly one learning rate.
    self.assertEqual(len(per_step_lrs[0]), 1)
    observed = [step_lrs[0] for step_lrs in per_step_lrs]
    self.assertListEqual(observed, wanted)
def test_warmup_linear_scheduler(self):
    """WarmupLinearSchedule (2 warmup steps, t_total=10): ramp up, then decay linearly to 0."""
    schedule = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10)
    per_step_lrs = unwrap_schedule(schedule, self.num_steps)
    wanted = [
        5.0, 10.0,                                      # warmup
        8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0,     # linear decay
    ]
    # Each step is expected to report exactly one learning rate.
    self.assertEqual(len(per_step_lrs[0]), 1)
    observed = [step_lrs[0] for step_lrs in per_step_lrs]
    self.assertListEqual(observed, wanted)
def test_warmup_cosine_scheduler(self):
    """WarmupCosineSchedule (2 warmup steps, t_total=10): ramp up, then cosine decay to 0."""
    schedule = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10)
    per_step_lrs = unwrap_schedule(schedule, self.num_steps)
    wanted = [
        5.0, 10.0,                                       # warmup
        9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0,    # cosine decay
    ]
    # Each step is expected to report exactly one learning rate.
    self.assertEqual(len(per_step_lrs[0]), 1)
    observed = [step_lrs[0] for step_lrs in per_step_lrs]
    # Expected values are rounded to two decimals, hence the 1e-2 tolerance.
    self.assertListAlmostEqual(observed, wanted, tol=1e-2)
def test_warmup_cosine_hard_restart_scheduler(self):
    """WarmupCosineWithHardRestartsSchedule (2 warmup steps, 2 cycles, t_total=10)."""
    schedule = WarmupCosineWithHardRestartsSchedule(
        self.optimizer, warmup_steps=2, cycles=2, t_total=10)
    per_step_lrs = unwrap_schedule(schedule, self.num_steps)
    wanted = [
        5.0, 10.0, 8.53, 5.0, 1.46,     # warmup, then first cosine cycle
        10.0, 8.53, 5.0, 1.46, 0.0,     # restart, second cycle, final 0.0
    ]
    # Each step is expected to report exactly one learning rate.
    self.assertEqual(len(per_step_lrs[0]), 1)
    observed = [step_lrs[0] for step_lrs in per_step_lrs]
    # Expected values are rounded to two decimals, hence the 1e-2 tolerance.
    self.assertListAlmostEqual(observed, wanted, tol=1e-2)
if
__name__
==
"__main__"
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment