chenpangpang / transformers / Commits / f63ff536
"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "f9a2a9e32bf4f6dc128decd0c124fa1f5507532e"
Commit f63ff536, authored Jul 20, 2019 by Rabeeh KARIMI

fixed version issues in run_openai_gpt

Parent: a6154990
Showing 1 changed file with 25 additions and 12 deletions.
examples/single_model_scripts/run_openai_gpt.py (+25, -12, view file @ f63ff536)
@@ -40,7 +40,8 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
 from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                  AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME)
+                                  AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
+                                  WarmupLinearSchedule)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
@@ -104,9 +105,18 @@ def main():
     parser.add_argument('--num_train_epochs', type=int, default=3)
     parser.add_argument('--train_batch_size', type=int, default=8)
     parser.add_argument('--eval_batch_size', type=int, default=16)
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
     parser.add_argument('--max_grad_norm', type=int, default=1)
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training \
+                        steps to perform. Override num_train_epochs.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before \
+                        performing a backward/update pass.")
     parser.add_argument('--learning_rate', type=float, default=6.25e-5)
-    parser.add_argument('--warmup_proportion', type=float, default=0.002)
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
     parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
     parser.add_argument('--weight_decay', type=float, default=0.01)
     parser.add_argument('--lm_coef', type=float, default=0.9)
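Note: the four flags added here are exactly the knobs consumed by the new AdamW/WarmupLinearSchedule pair and by the t_total computation further down. A small, self-contained sketch of just these arguments, with names and defaults copied from the diff, so the resulting namespace is easy to inspect (the isolated parser is illustrative, not the script's full CLI):

# Sketch: only the arguments this hunk introduces, copied from the diff above,
# parsed in isolation so the defaults can be inspected without the full script.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                    help="Epsilon for Adam optimizer.")
parser.add_argument("--max_steps", default=-1, type=int,
                    help="If > 0: total number of training steps; overrides num_train_epochs.")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                    help="Updates to accumulate before a backward/update pass.")
parser.add_argument("--warmup_steps", default=0, type=int,
                    help="Linear warmup over warmup_steps.")

args = parser.parse_args([])   # empty argv: fall back to the defaults
print(args)                    # Namespace(adam_epsilon=1e-08, gradient_accumulation_steps=1, max_steps=-1, warmup_steps=0)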
@@ -184,19 +194,22 @@ def main():
 
     # Prepare optimizer
     if args.do_train:
+        if args.max_steps > 0:
+            t_total = args.max_steps
+            args.num_train_epochs = args.max_steps //\
+                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+        else:
+            t_total = len(train_dataloader)\
+                // args.gradient_accumulation_steps * args.num_train_epochs
+
         param_optimizer = list(model.named_parameters())
         no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
         optimizer_grouped_parameters = [
             {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
             {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
             ]
-        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
-        optimizer = AdamW(optimizer_grouped_parameters,
-                          lr=args.learning_rate,
-                          warmup=args.warmup_proportion,
-                          max_grad_norm=args.max_grad_norm,
-                          weight_decay=args.weight_decay,
-                          t_total=num_train_optimization_steps)
+        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
 
     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
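Note: the new block derives t_total, the schedule length in optimizer updates, either directly from --max_steps or from the epoch and batch counts. A worked example of the same arithmetic with illustrative numbers (the batch count, accumulation factor, and epoch count are assumptions, not values from the commit):

# Sketch of the t_total arithmetic from the hunk above; the concrete numbers
# (1000 batches per epoch, accumulation over 4 batches, 3 epochs) are assumptions.
num_batches = 1000                 # stands in for len(train_dataloader)
gradient_accumulation_steps = 4
num_train_epochs = 3
max_steps = -1                     # <= 0 means "derive the length from the epoch count"

if max_steps > 0:
    t_total = max_steps            # the schedule length is pinned by --max_steps
    num_train_epochs = max_steps // (num_batches // gradient_accumulation_steps) + 1
else:
    t_total = num_batches // gradient_accumulation_steps * num_train_epochs

print(t_total)                     # 750 optimizer updates for the scheduler to span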
@@ -211,12 +224,13 @@ def main():
                 losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                 loss = args.lm_coef * losses[0] + losses[1]
                 loss.backward()
+                scheduler.step()
                 optimizer.step()
                 optimizer.zero_grad()
                 tr_loss += loss.item()
                 exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                 nb_tr_steps += 1
-                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
+                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])
 
     # Save a trained model
     if args.do_train:
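Note: with the schedule now living outside the optimizer, each update touches both objects, and the progress bar reads the learning rate from the scheduler instead of the optimizer. A minimal sketch of one update in the order the patched loop uses, with a dummy linear layer standing in for the GPT double-heads model (all tensors and sizes are placeholders):

# Sketch of one optimization step in the order the patched loop uses
# (scheduler.step() before optimizer.step()); the linear layer and random
# tensors are placeholders for the GPT double-heads model and its batches.
import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(10, 1)
optimizer = AdamW(model.parameters(), lr=6.25e-5, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=10, t_total=100)

lm_coef = 0.9
for _ in range(3):
    lm_loss = model(torch.randn(4, 10)).pow(2).mean()    # dummy language-modeling loss
    mc_loss = model(torch.randn(4, 10)).abs().mean()     # dummy multiple-choice loss
    loss = lm_coef * lm_loss + mc_loss                   # same weighting as the script
    loss.backward()
    scheduler.step()                                     # advance the warmup-linear LR
    optimizer.step()
    optimizer.zero_grad()
    print("lr:", scheduler.get_lr()[0])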
@@ -244,8 +258,7 @@ def main():
             batch = tuple(t.to(device) for t in batch)
             input_ids, mc_token_ids, lm_labels, mc_labels = batch
             with torch.no_grad():
-                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
-                _, mc_logits = model(input_ids, mc_token_ids)
+                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels)
 
                 mc_logits = mc_logits.detach().cpu().numpy()
                 mc_labels = mc_labels.to('cpu').numpy()
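Note: because lm_labels and mc_labels are both passed, the double-heads model returns the losses and logits of both heads from a single forward pass, so the second, label-free call is dropped. A sketch of how mc_logits from that one call could be scored against mc_labels (random placeholder arrays; ROCStories offers two candidate endings per story, hence the shape):

# Sketch: scoring the classification head's output from the single forward pass.
# The arrays are random placeholders shaped like the ROCStories setup
# (16 examples, 2 candidate endings each), not real model output.
import numpy as np

mc_logits = np.random.randn(16, 2)        # (batch, num_choices) after .detach().cpu().numpy()
mc_labels = np.random.randint(0, 2, 16)   # gold ending index per example

n_correct = np.sum(np.argmax(mc_logits, axis=1) == mc_labels)
print(n_correct / len(mc_labels))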