chenpangpang / transformers · Commits

Commit 5f04aa00, authored Nov 09, 2018 by thomwolf

option to perform optimization and keep the optimizer averages on CPU

parent 9e95cd8c
Showing 1 changed file with 23 additions and 14 deletions:

run_squad.py  (+23 −14)
@@ -719,7 +719,6 @@ def main():
     parser.add_argument("--max_answer_length", default=30, type=int,
                         help="The maximum length of an answer that can be generated. This is needed because the start "
                              "and end predictions are not conditioned on one another.")
     parser.add_argument("--verbose_logging", default=False, action='store_true',
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
@@ -727,10 +726,6 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
     parser.add_argument('--seed',
                         type=int,
                         default=42,
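Aside, not part of this commit: pairing default=False with action='store_true', as the --no_cuda and --verbose_logging arguments do, is redundant because argparse already defaults a store_true flag to False. A minimal sketch using only the standard library:

import argparse

parser = argparse.ArgumentParser()
# action='store_true' already implies a default of False, so the explicit
# default=False used in run_squad.py is redundant (but harmless).
parser.add_argument('--no_cuda', action='store_true',
                    help="Whether not to use CUDA when available")

print(parser.parse_args([]).no_cuda)             # False
print(parser.parse_args(['--no_cuda']).no_cuda)  # True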
@@ -738,7 +733,16 @@ def main():
     parser.add_argument('--gradient_accumulation_steps',
                         type=int,
                         default=1,
-                        help="Number of updates steps to accumualte before performing a backward/update pass.")
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--optimize_on_cpu',
+                        default=False,
+                        action='store_true',
+                        help="Whether to perform optimization and keep the optimizer averages on CPU")
     args = parser.parse_args()
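The --local_rank argument is simply moved down next to the new --optimize_on_cpu flag. A minimal sketch, not taken from the script, of how the parsed flag pairs with the usual device selection (torch.device and torch.cuda.is_available are standard PyTorch; the rest is illustrative):

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                    help="Whether to perform optimization and keep the optimizer averages on CPU")
args = parser.parse_args(['--optimize_on_cpu'])

# The flag does not change which device runs the forward/backward pass;
# it only decides where the optimizer update (and its averages) will live.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(args.optimize_on_cpu, device)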
@@ -802,25 +806,26 @@ def main():
     model = BertForQuestionAnswering(bert_config)
     if args.init_checkpoint is not None:
         model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+    if not args.optimize_on_cpu:
+        model.to(device)
 
     no_decay = ['bias', 'gamma', 'beta']
     optimizer_parameters = [
         {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
         {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
         ]
 
     optimizer = BERTAdam(optimizer_parameters,
                          lr=args.learning_rate,
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
 
     global_step = 0
     if args.do_train:
         train_features = convert_examples_to_features(
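Why it is safe to build BERTAdam before the model is moved: by default nn.Module.to() converts parameters in place (each Parameter keeps its identity and only its .data is swapped), so param groups collected while the model sits on the CPU still point at the live parameters after model.to(device). A minimal sketch of that property, with nn.Linear and torch.optim.SGD standing in for BertForQuestionAnswering and BERTAdam (note SGD's key is 'weight_decay', not 'weight_decay_rate'):

import torch
from torch import nn

model = nn.Linear(4, 2)                              # toy stand-in model
no_decay = ['bias']
optimizer_parameters = [
    {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay': 0.0},
]
optimizer = torch.optim.SGD(optimizer_parameters, lr=0.1)

# Moving the model afterwards does not invalidate the optimizer: .to() keeps
# the same Parameter objects, so the param groups still reference them.
if torch.cuda.is_available():
    model.to('cuda')
weight = next(model.parameters())
assert any(weight is p for group in optimizer.param_groups for p in group['params'])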
@@ -862,8 +867,12 @@ def main():
                     loss = loss / args.gradient_accumulation_steps
                 loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.optimize_on_cpu:
+                        model.to('cpu')
                     optimizer.step()    # We have accumulated enought gradients
                     model.zero_grad()
+                    if args.optimize_on_cpu:
+                        model.to(device)
                     global_step += 1
     if args.do_predict:
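Taken together, the --optimize_on_cpu path moves the whole model to the CPU just for the parameter update, then back to the GPU for the next forward/backward pass. Because Adam-style optimizers (as BERTAdam does) allocate their moving-average buffers lazily on the first step(), those buffers end up in host memory and stay there, freeing GPU memory at the cost of slower updates and per-step transfers. A self-contained sketch of that loop, with a toy model and torch.optim.Adam standing in for the script's BertForQuestionAnswering and BERTAdam:

import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimize_on_cpu = True                                   # stands in for args.optimize_on_cpu

model = nn.Linear(8, 2)                                  # toy stand-in model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model.to(device)

for step in range(4):
    inputs = torch.randn(16, 8, device=device)
    targets = torch.randn(16, 2, device=device)
    loss = nn.functional.mse_loss(model(inputs), targets)
    loss.backward()

    if optimize_on_cpu:
        model.to('cpu')      # .to() also converts the accumulated .grad tensors
    optimizer.step()         # state is created lazily, so exp_avg/exp_avg_sq live on the CPU
    model.zero_grad()
    if optimize_on_cpu:
        model.to(device)

# The optimizer's moving averages remain in host memory:
state = optimizer.state[next(model.parameters())]
print({k: v.device for k, v in state.items() if torch.is_tensor(v)})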