Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
6bc082da
Commit
6bc082da
authored
Feb 08, 2019
by
thomwolf
Browse files
updating examples
parent
eb8fda51
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
371 additions
and
734 deletions
+371
-734
examples/run_openai_gpt.py
examples/run_openai_gpt.py
+229
-0
examples/run_transfo_xl.py
examples/run_transfo_xl.py
+142
-0
examples/train_transfo_xl.py
examples/train_transfo_xl.py
+0
-595
examples/transfo_xl_eval.py
examples/transfo_xl_eval.py
+0
-139
No files found.
examples/
trai
n_openai_gpt.py
→
examples/
ru
n_openai_gpt.py
View file @
6bc082da
...
@@ -24,115 +24,20 @@ import os
...
@@ -24,115 +24,20 @@ import os
import
csv
import
csv
import
random
import
random
import
logging
import
logging
from
tqdm
import
tqdm
from
tqdm
import
tqdm
,
trange
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
from
torch.utils.data
import
(
DataLoader
,
RandomSampler
,
SequentialSampler
,
from
torch.utils.data
import
(
DataLoader
,
RandomSampler
,
SequentialSampler
,
TensorDataset
)
TensorDataset
)
from
sklearn.metrics
import
accuracy_score
from
sklearn.utils
import
shuffle
from
pytorch_pretrained_bert
import
OpenAIGPTDoubleHeadsModel
,
OpenAIGPTTokenizer
,
OpenAIAdam
from
pytorch_pretrained_bert
import
OpenAIGPTDoubleHeadsModel
,
OpenAIGPTTokenizer
,
OpenAIAdam
# from analysis import rocstories as rocstories_analysis
# from datasets import rocstories
# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
# from opt import OpenAIAdam
# from text_utils import TextEncoder
# from utils import (encode_dataset, iter_data,
# ResultLogger, make_path)
# from loss import MultipleChoiceLossCompute
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
level
=
logging
.
INFO
)
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
def iter_apply(Xs, Ms, Ys):
    """Run the model over ``(Xs, Ms, Ys)`` batch by batch with gradients disabled.

    Returns a ``(logits, cost)`` pair: ``logits`` is the concatenation along
    axis 0 of every batch's classifier logits (each scaled in place by its
    batch size), and ``cost`` is the sum over batches of the classifier
    losses (also scaled by the batch size).
    """
    collected_logits = []
    total_cost = 0
    with torch.no_grad():
        dh_model.eval()
        batches = iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True)
        for xmb, mmb, ymb in batches:
            batch_size = len(xmb)
            inputs = torch.tensor(xmb, dtype=torch.long).to(device)
            labels = torch.tensor(ymb, dtype=torch.long).to(device)
            masks = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(inputs)
            # Scaled in place *before* the loss call, so the loss sees the
            # scaled logits.
            clf_logits *= batch_size
            clf_losses = compute_loss_fct(inputs, labels, masks, clf_logits,
                                          only_return_losses=True)
            clf_losses *= batch_size
            collected_logits.append(clf_logits.to("cpu").numpy())
            total_cost += clf_losses.sum().item()
        stacked_logits = np.concatenate(collected_logits, 0)
    return stacked_logits, total_cost
def iter_predict(Xs, Ms):
    """Return the classifier logits for ``Xs``, computed batch by batch.

    The model is put in eval mode and run with gradients disabled. The
    per-batch logits are moved to the CPU and concatenated along axis 0.
    ``Ms`` is only used to keep the batching aligned with ``Xs``; the mask
    batches themselves are not fed to the model.
    """
    logits = []
    with torch.no_grad():
        dh_model.eval()
        for xmb, _ in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            _, clf_logits = dh_model(XMB)
            logits.append(clf_logits.to("cpu").numpy())
    logits = np.concatenate(logits, 0)
    return logits
def log(save_dir, desc):
    """Evaluate on a training subset and the validation set, then report.

    Computes cost and accuracy (in percent) on the first ``n_valid`` training
    examples and on the validation set, logs and prints them, and, when
    ``submit`` is set, saves the model weights to
    ``save_dir/desc/best_params`` each time the validation accuracy exceeds
    the global ``best_score``.
    """
    global best_score
    print("Logging")
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost / len(trY[:n_valid])
    # NOTE(review): divides by n_valid rather than len(vaY) -- confirm the
    # validation set really holds n_valid examples.
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
    # `logger` is a standard `logging.Logger`: `Logger.log()` requires a level
    # and a message, so a keyword-only call raises TypeError. Report the same
    # fields through `info()` instead.
    logger.info("n_epochs=%d n_updates=%d tr_cost=%.3f va_cost=%.3f tr_acc=%.2f va_acc=%.2f",
                n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            path = os.path.join(save_dir, desc, 'best_params')
            torch.save(dh_model.state_dict(), make_path(path))
def predict(dataset, submission_dir):
    """Write test-set predictions for ``dataset`` as a two-column TSV file.

    The output file name, the function turning logits into predictions and
    the optional label decoder are looked up per dataset. The file is written
    to ``submission_dir`` (created if needed) with an ``index``/``prediction``
    header followed by one row per prediction.
    """
    out_name = filenames[dataset]
    to_predictions = pred_fns[dataset]
    decoder = label_decoders[dataset]

    predictions = to_predictions(iter_predict(teX, teM))
    if decoder is not None:
        # Map raw predictions to their dataset-specific labels.
        predictions = [decoder[raw] for raw in predictions]

    out_path = os.path.join(submission_dir, out_name)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as out_file:
        out_file.write('{}\t{}\n'.format('index', 'prediction'))
        out_file.writelines('{}\t{}\n'.format(row, value)
                            for row, value in enumerate(predictions))
def run_epoch():
    """Run one training pass over the freshly shuffled training data.

    The global ``n_updates`` is incremented once per batch. During the first
    epoch only (``n_epochs == 0``), ``log`` is called when ``n_updates`` hits
    one of a fixed set of update counts.
    """
    global n_updates
    shuffled = shuffle(trX, trM, trYt, random_state=np.random)
    for xmb, mmb, ymb in iter_data(*shuffled, n_batch=n_batch_train, truncate=True, verbose=True):
        dh_model.train()
        inputs = torch.tensor(xmb, dtype=torch.long).to(device)
        labels = torch.tensor(ymb, dtype=torch.long).to(device)
        masks = torch.tensor(mmb).to(device)
        lm_logits, clf_logits = dh_model(inputs)
        # Called for its side effects only (return value is discarded);
        # presumably it also performs the backward pass and optimizer step --
        # TODO confirm against compute_loss_fct.
        compute_loss_fct(inputs, labels, masks, clf_logits, lm_logits)
        n_updates += 1
        if n_epochs == 0 and n_updates in [1000, 2000, 4000, 8000, 16000, 32000]:
            log(save_dir, desc)
def
accuracy
(
out
,
labels
):
def
accuracy
(
out
,
labels
):
outputs
=
np
.
argmax
(
out
,
axis
=
1
)
outputs
=
np
.
argmax
(
out
,
axis
=
1
)
return
np
.
sum
(
outputs
==
labels
)
return
np
.
sum
(
outputs
==
labels
)
...
@@ -147,35 +52,43 @@ def load_rocstories_dataset(dataset_path):
...
@@ -147,35 +52,43 @@ def load_rocstories_dataset(dataset_path):
output
.
append
((
' '
.
join
(
line
[
1
:
5
]),
line
[
5
],
line
[
6
],
int
(
line
[
-
1
])
-
1
))
output
.
append
((
' '
.
join
(
line
[
1
:
5
]),
line
[
5
],
line
[
6
],
int
(
line
[
-
1
])
-
1
))
return
output
return
output
def
pre_process_dataset
(
encoded_dataset
,
max_len
,
start_token
,
delimiter_token
,
clf_token
):
def
pre_process_datasets
(
encoded_datasets
,
max_len
,
start_token
,
delimiter_token
,
clf_token
):
n_batch
=
len
(
dataset
)
""" Pre-process datasets containing lists of
input_ids
=
np
.
zeros
((
n_batch
,
2
,
max_len
),
dtype
=
np
.
int32
)
tuples(story, 1st continuation, 2nd continuation, label)
mc_token_mask
=
np
.
zeros
((
n_batch
,
2
,
max_len
),
dtype
=
np
.
int32
)
lm_labels
=
np
.
full
((
n_batch
,
2
,
max_len
),
-
1
,
dtype
=
np
.
float32
)
In Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
mc_labels
=
np
.
zeros
((
n_batch
,),
dtype
=
np
.
float32
)
input_ids[batch, alternative, :] = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
for
i
,
(
story
,
cont1
,
cont2
,
mc_label
),
in
enumerate
(
encoded_dataset
):
"""
with_cont1
=
[
start_token
]
+
story
[:
max_len
]
+
[
delimiter_token
]
+
cont1
[:
max_len
]
+
[
clf_token
]
tensor_datasets
=
[]
with_cont2
=
[
start_token
]
+
story
[:
max_len
]
+
[
delimiter_token
]
+
cont2
[:
max_len
]
+
[
clf_token
]
for
dataset
in
encoded_datasets
:
xmb
[
i
,
0
,
:
len
(
with_cont1
)]
=
with_cont1
n_batch
=
len
(
dataset
)
xmb
[
i
,
1
,
:
len
(
with_cont2
)]
=
with_cont2
input_ids
=
np
.
zeros
((
n_batch
,
2
,
max_len
),
dtype
=
np
.
int32
)
mc_token_mask
[
i
,
0
,
len
(
with_cont1
)
-
1
]
=
1
mc_token_mask
=
np
.
zeros
((
n_batch
,
2
,
max_len
),
dtype
=
np
.
int32
)
lm_labels
[
i
,
0
,
:
len
(
with_cont1
)
-
1
]
=
with_cont1
[
1
:]
lm_labels
=
np
.
full
((
n_batch
,
2
,
max_len
),
-
1
,
dtype
=
np
.
float32
)
lm_labels
[
i
,
1
,
:
len
(
with_cont2
)
-
1
]
=
with_cont2
[
1
:]
mc_labels
=
np
.
zeros
((
n_batch
,),
dtype
=
np
.
float32
)
mc_labels
[
i
]
=
mc_label
for
i
,
(
story
,
cont1
,
cont2
,
mc_label
),
in
enumerate
(
dataset
):
all_inputs
=
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
)
with_cont1
=
[
start_token
]
+
story
[:
max_len
]
+
[
delimiter_token
]
+
cont1
[:
max_len
]
+
[
clf_token
]
all_input_tensors
=
list
(
torch
.
tensor
(
t
)
for
t
in
all_inputs
)
with_cont2
=
[
start_token
]
+
story
[:
max_len
]
+
[
delimiter_token
]
+
cont2
[:
max_len
]
+
[
clf_token
]
return
all_input_tensors
input_ids
[
i
,
0
,
:
len
(
with_cont1
)]
=
with_cont1
input_ids
[
i
,
1
,
:
len
(
with_cont2
)]
=
with_cont2
mc_token_mask
[
i
,
0
,
len
(
with_cont1
)
-
1
]
=
1
lm_labels
[
i
,
0
,
:
len
(
with_cont1
)
-
1
]
=
with_cont1
[
1
:]
if
__name__
==
'__main__'
:
lm_labels
[
i
,
1
,
:
len
(
with_cont2
)
-
1
]
=
with_cont2
[
1
:]
mc_labels
[
i
]
=
mc_label
all_inputs
=
tuple
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
)
tensor_datasets
.
append
(
tuple
(
torch
.
tensor
(
t
)
for
t
in
all_inputs
))
return
tensor_datasets
def
main
():
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--model_name'
,
type
=
str
,
default
=
'openai-gpt'
,
parser
.
add_argument
(
'--model_name'
,
type
=
str
,
default
=
'openai-gpt'
,
help
=
'pretrained model name'
)
help
=
'pretrained model name'
)
parser
.
add_argument
(
'--data_dir'
,
type
=
str
,
default
=
'data/'
)
parser
.
add_argument
(
'--train_dataset'
,
type
=
str
,
default
=
'cloze_test_val__spring2016 - cloze_test_ALL_val.tsv'
)
parser
.
add_argument
(
'--eval_dataset'
,
type
=
str
,
default
=
'test_spring2016.tsv'
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
42
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
42
)
parser
.
add_argument
(
'--num_train_epochs'
,
type
=
int
,
default
=
3
)
parser
.
add_argument
(
'--num_train_epochs'
,
type
=
int
,
default
=
3
)
parser
.
add_argument
(
'--train_batch_size'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--train_batch_size'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--eval_batch_size'
,
type
=
int
,
default
=
16
)
parser
.
add_argument
(
'--max_grad_norm'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--max_grad_norm'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--learning_rate'
,
type
=
float
,
default
=
6.25e-5
)
parser
.
add_argument
(
'--learning_rate'
,
type
=
float
,
default
=
6.25e-5
)
parser
.
add_argument
(
'--warmup_proportion'
,
type
=
float
,
default
=
0.002
)
parser
.
add_argument
(
'--warmup_proportion'
,
type
=
float
,
default
=
0.002
)
...
@@ -194,7 +107,7 @@ if __name__ == '__main__':
...
@@ -194,7 +107,7 @@ if __name__ == '__main__':
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
n_gpu
=
torch
.
cuda
.
device_count
()
n_gpu
=
torch
.
cuda
.
device_count
()
logger
.
info
(
"device
"
,
device
,
"
n_gpu
"
,
n_gpu
)
logger
.
info
(
"device
: {}
, n_gpu
{}"
.
format
(
device
,
n_gpu
)
)
# Load tokenizer and model
# Load tokenizer and model
# These loading functions also add new tokens and embeddings called `special tokens`
# These loading functions also add new tokens and embeddings called `special tokens`
...
@@ -204,22 +117,34 @@ if __name__ == '__main__':
...
@@ -204,22 +117,34 @@ if __name__ == '__main__':
special_tokens_ids
=
list
(
tokenizer
.
convert_tokens_to_ids
(
token
)
for
token
in
special_tokens
)
special_tokens_ids
=
list
(
tokenizer
.
convert_tokens_to_ids
(
token
)
for
token
in
special_tokens
)
model
=
OpenAIGPTDoubleHeadsModel
.
from_pretrained
(
args
.
model_name
,
num_special_tokens
=
len
(
special_tokens
))
model
=
OpenAIGPTDoubleHeadsModel
.
from_pretrained
(
args
.
model_name
,
num_special_tokens
=
len
(
special_tokens
))
# Load
the dataset and prepare the inpu
ts
# Load
and encode the datase
ts
logger
.
info
(
"Encoding dataset..."
)
logger
.
info
(
"Encoding dataset..."
)
dataset
=
load_rocstories_dataset
(
args
.
dataset_path
)
train_dataset
=
load_rocstories_dataset
(
args
.
train_dataset
)
tokenized_dataset
=
list
(
list
(
tokenizer
.
tokenize
(
x
)
for
x
in
instance
)
for
instance
in
dataset
)
eval_datset
=
load_rocstories_dataset
(
args
.
eval_datset
)
encoded_dataset
=
list
(
list
(
tokenizer
.
convert_tokens_to_ids
(
x
)
for
x
in
instance
)
for
instance
in
tokenized_dataset
)
datasets
=
(
train_dataset
,
eval_datset
)
tokenized_datasets
=
tuple
(
list
(
list
(
tokenizer
.
tokenize
(
x
)
for
x
in
instance
)
max_input_length
=
max
(
len
(
story
)
+
max
(
len
(
cont1
),
len
(
cont2
))
+
3
for
story
,
cont1
,
cont2
,
_
in
encoded_dataset
)
for
instance
in
dataset
)
for
dataset
in
datasets
)
encoded_datasets
=
tuple
(
list
(
list
(
tokenizer
.
convert_tokens_to_ids
(
x
)
for
x
in
instance
)
for
instance
in
dataset
)
for
dataset
in
tokenized_datasets
)
# Compute the max input length for the Transformer
max_input_length
=
max
(
len
(
story
)
+
max
(
len
(
cont1
),
len
(
cont2
))
+
3
\
for
dataset
in
encoded_datasets
for
story
,
cont1
,
cont2
,
_
in
dataset
)
max_input_length
=
min
(
max_input_length
,
model
.
config
.
n_positions
)
# Max size of input for the pre-trained model
max_input_length
=
min
(
max_input_length
,
model
.
config
.
n_positions
)
# Max size of input for the pre-trained model
max_sub_part_length
=
max_input_length
//
2
-
2
max_sub_part_length
=
max_input_length
//
2
-
2
# Prepare dataloader
# Prepare inputs tensors and dataloaders
dataset_tensors
=
pre_process_dataset
(
encoded_dataset
,
max_sub_part_length
,
*
special_tokens_ids
)
tensor_datasets
=
pre_process_datasets
(
encoded_datasets
,
max_sub_part_length
,
*
special_tokens_ids
)
train_data
=
TensorDataset
(
*
dataset_tensors
)
train_tensor_dataset
,
eval_tensor_dataset
=
tensor_datasets
[
0
],
tensor_datasets
[
1
]
train_data
=
TensorDataset
(
*
train_tensor_dataset
)
train_sampler
=
RandomSampler
(
train_data
)
train_sampler
=
RandomSampler
(
train_data
)
train_dataloader
=
DataLoader
(
train_data
,
sampler
=
train_sampler
,
batch_size
=
args
.
train_batch_size
)
train_dataloader
=
DataLoader
(
train_data
,
sampler
=
train_sampler
,
batch_size
=
args
.
train_batch_size
)
eval_data
=
TensorDataset
(
*
eval_tensor_dataset
)
eval_sampler
=
SequentialSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
# Prepare optimizer
# Prepare optimizer
param_optimizer
=
list
(
model
.
named_parameters
())
param_optimizer
=
list
(
model
.
named_parameters
())
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
...
@@ -232,11 +157,10 @@ if __name__ == '__main__':
...
@@ -232,11 +157,10 @@ if __name__ == '__main__':
lr
=
args
.
learning_rate
,
lr
=
args
.
learning_rate
,
warmup
=
args
.
warmup_proportion
,
warmup
=
args
.
warmup_proportion
,
max_grad_norm
=
args
.
max_grad_norm
,
max_grad_norm
=
args
.
max_grad_norm
,
weight_decay
=
ar
s
g
.
weight_decay
,
weight_decay
=
arg
s
.
weight_decay
,
t_total
=
num_train_optimization_steps
)
t_total
=
num_train_optimization_steps
)
if
args
.
do_train
:
if
args
.
do_train
:
global_step
=
0
nb_tr_steps
=
0
nb_tr_steps
=
0
tr_loss
=
0
tr_loss
=
0
model
.
train
()
model
.
train
()
...
@@ -249,6 +173,7 @@ if __name__ == '__main__':
...
@@ -249,6 +173,7 @@ if __name__ == '__main__':
losses
=
model
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
)
losses
=
model
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
)
loss
=
args
.
lm_coef
*
losses
[
0
]
+
losses
[
1
]
loss
=
args
.
lm_coef
*
losses
[
0
]
+
losses
[
1
]
loss
.
backward
()
loss
.
backward
()
optimizer
.
step
()
tr_loss
+=
loss
.
item
()
tr_loss
+=
loss
.
item
()
nb_tr_examples
+=
input_ids
.
size
(
0
)
nb_tr_examples
+=
input_ids
.
size
(
0
)
nb_tr_steps
+=
1
nb_tr_steps
+=
1
...
@@ -261,44 +186,26 @@ if __name__ == '__main__':
...
@@ -261,44 +186,26 @@ if __name__ == '__main__':
# Load a trained model that you have fine-tuned
# Load a trained model that you have fine-tuned
model_state_dict
=
torch
.
load
(
output_model_file
)
model_state_dict
=
torch
.
load
(
output_model_file
)
model
=
OpenAIGPTDoubleHeadsModel
(
args
.
mode
,
state_dict
=
model_state_dict
,
num_labels
=
num_labels
)
model
=
OpenAIGPTDoubleHeadsModel
.
from_pretrained
(
args
.
model_name
,
state_dict
=
model_state_dict
,
num_special_tokens
=
len
(
special_tokens
))
model
.
to
(
device
)
model
.
to
(
device
)
if
args
.
do_eval
:
if
args
.
do_eval
:
eval_examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
eval_features
=
convert_examples_to_features
(
eval_examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
)
logger
.
info
(
"***** Running evaluation *****"
)
logger
.
info
(
" Num examples = %d"
,
len
(
eval_examples
))
logger
.
info
(
" Batch size = %d"
,
args
.
eval_batch_size
)
all_input_ids
=
torch
.
tensor
([
f
.
input_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_input_mask
=
torch
.
tensor
([
f
.
input_mask
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_segment_ids
=
torch
.
tensor
([
f
.
segment_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_label_ids
=
torch
.
tensor
([
f
.
label_id
for
f
in
eval_features
],
dtype
=
torch
.
long
)
eval_data
=
TensorDataset
(
all_input_ids
,
all_input_mask
,
all_segment_ids
,
all_label_ids
)
# Run prediction for full data
eval_sampler
=
SequentialSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
model
.
eval
()
model
.
eval
()
eval_loss
,
eval_accuracy
=
0
,
0
eval_loss
,
eval_accuracy
=
0
,
0
nb_eval_steps
,
nb_eval_examples
=
0
,
0
nb_eval_steps
,
nb_eval_examples
=
0
,
0
for
batch
in
tqdm
(
eval_dataloader
,
desc
=
"Evaluating"
):
for
input_ids
,
input_mask
,
segment_ids
,
label_ids
in
tqdm
(
eval_dataloader
,
desc
=
"Evaluating"
):
batch
=
tuple
(
t
.
to
(
device
)
for
t
in
batch
)
input_ids
=
input_ids
.
to
(
device
)
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
=
batch
input_mask
=
input_mask
.
to
(
device
)
segment_ids
=
segment_ids
.
to
(
device
)
label_ids
=
label_ids
.
to
(
device
)
with
torch
.
no_grad
():
with
torch
.
no_grad
():
tmp_eval
_loss
=
model
(
input_ids
,
segment_ids
,
input_mask
,
label
_id
s
)
_
,
mc
_loss
=
model
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_
labels
)
logits
=
model
(
input_ids
,
segment_ids
,
input
_mask
)
_
,
mc_
logits
=
model
(
input_ids
,
mc_token
_mask
)
logits
=
logits
.
detach
().
cpu
().
numpy
()
mc_
logits
=
mc_
logits
.
detach
().
cpu
().
numpy
()
label
_id
s
=
label
_id
s
.
to
(
'cpu'
).
numpy
()
mc_
labels
=
mc_
labels
.
to
(
'cpu'
).
numpy
()
tmp_eval_accuracy
=
accuracy
(
logits
,
label
_id
s
)
tmp_eval_accuracy
=
accuracy
(
mc_
logits
,
mc_
labels
)
eval_loss
+=
tmp_eval
_loss
.
mean
().
item
()
eval_loss
+=
mc
_loss
.
mean
().
item
()
eval_accuracy
+=
tmp_eval_accuracy
eval_accuracy
+=
tmp_eval_accuracy
nb_eval_examples
+=
input_ids
.
size
(
0
)
nb_eval_examples
+=
input_ids
.
size
(
0
)
...
@@ -306,11 +213,10 @@ if __name__ == '__main__':
...
@@ -306,11 +213,10 @@ if __name__ == '__main__':
eval_loss
=
eval_loss
/
nb_eval_steps
eval_loss
=
eval_loss
/
nb_eval_steps
eval_accuracy
=
eval_accuracy
/
nb_eval_examples
eval_accuracy
=
eval_accuracy
/
nb_eval_examples
loss
=
tr_loss
/
nb_tr_steps
if
args
.
do_train
else
None
train_
loss
=
tr_loss
/
nb_tr_steps
if
args
.
do_train
else
None
result
=
{
'eval_loss'
:
eval_loss
,
result
=
{
'eval_loss'
:
eval_loss
,
'eval_accuracy'
:
eval_accuracy
,
'eval_accuracy'
:
eval_accuracy
,
'global_step'
:
global_step
,
'train_loss'
:
train_loss
}
'loss'
:
loss
}
output_eval_file
=
os
.
path
.
join
(
args
.
output_dir
,
"eval_results.txt"
)
output_eval_file
=
os
.
path
.
join
(
args
.
output_dir
,
"eval_results.txt"
)
with
open
(
output_eval_file
,
"w"
)
as
writer
:
with
open
(
output_eval_file
,
"w"
)
as
writer
:
...
@@ -319,26 +225,5 @@ if __name__ == '__main__':
...
@@ -319,26 +225,5 @@ if __name__ == '__main__':
logger
.
info
(
" %s = %s"
,
key
,
str
(
result
[
key
]))
logger
.
info
(
" %s = %s"
,
key
,
str
(
result
[
key
]))
writer
.
write
(
"%s = %s
\n
"
%
(
key
,
str
(
result
[
key
])))
writer
.
write
(
"%s = %s
\n
"
%
(
key
,
str
(
result
[
key
])))
if
__name__
==
"
__main__
"
:
if
__name__
==
'
__main__
'
:
main
()
main
()
n_updates
=
0
n_epochs
=
0
if
dataset
!=
'stsb'
:
trYt
=
trY
if
submit
:
path
=
os
.
path
.
join
(
save_dir
,
desc
,
'best_params'
)
torch
.
save
(
dh_model
.
state_dict
(),
make_path
(
path
))
best_score
=
0
for
i
in
range
(
args
.
n_iter
):
print
(
"running epoch"
,
i
)
run_epoch
()
n_epochs
+=
1
log
(
save_dir
,
desc
)
if
submit
:
path
=
os
.
path
.
join
(
save_dir
,
desc
,
'best_params'
)
dh_model
.
load_state_dict
(
torch
.
load
(
path
))
predict
(
dataset
,
args
.
submission_dir
)
if
args
.
analysis
:
rocstories_analysis
(
data_dir
,
os
.
path
.
join
(
args
.
submission_dir
,
'ROCStories.tsv'
),
os
.
path
.
join
(
log_dir
,
'rocstories.jsonl'
))
examples/run_transfo_xl.py
0 → 100644
View file @
6bc082da
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Transformer XL model evaluation script.
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
argparse
import
logging
import
time
import
math
import
torch
from
pytorch_pretrained_bert
import
TransfoXLModel
,
TransfoXLCorpus
# Timestamped, INFO-level logging for the whole script.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
# Module-level logger used by the code below.
logger = logging.getLogger(__name__)
def main():
    """Evaluate a pretrained Transformer-XL model and log loss and perplexity.

    Parses the command-line options, loads the pre-processed corpus and the
    pretrained model named by ``--model_name``, evaluates the split(s)
    selected by ``--split`` ('all', 'valid' or 'test') and logs, for each
    evaluated split, the mean loss and ``exp(loss)`` as perplexity.
    """
    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                        help='pretrained model name')
    parser.add_argument('--split', type=str, default='test',
                        choices=['all', 'valid', 'test'],
                        help='which split to evaluate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='batch size')
    parser.add_argument('--tgt_len', type=int, default=128,
                        help='number of tokens to predict')
    parser.add_argument('--ext_len', type=int, default=0,
                        help='length of the extended context')
    parser.add_argument('--mem_len', type=int, default=1600,
                        help='length of the retained previous heads')
    parser.add_argument('--clamp_len', type=int, default=1000,
                        help='max positional embedding index')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--work_dir', type=str, required=True,
                        help='path to the work_dir')
    parser.add_argument('--no_log', action='store_true',
                        help='do not log the eval result')
    parser.add_argument('--same_length', action='store_true',
                        help='set same length attention with masking')
    args = parser.parse_args()
    # Validate explicitly: an `assert` would be stripped under `python -O`.
    if args.ext_len < 0:
        parser.error('extended context length must be non-negative')

    device = torch.device("cuda" if args.cuda else "cpu")

    # Load a pre-processed dataset
    # You can also build the corpus yourself using TransfoXLCorpus methods
    # The pre-processing involves computing word frequencies to prepare the
    # Adaptive input and SoftMax, and tokenizing the dataset
    # The pre-processed corpus is a conversion (made with the conversion script)
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)

    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLModel.from_pretrained(args.model_name)
    model = model.to(device)

    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
        model.clamp_len = args.clamp_len
    if args.same_length:
        model.same_length = True

    ###############################################################################
    # Evaluation code
    ###############################################################################
    def evaluate(eval_iter):
        """Return the mean loss over ``eval_iter``, weighted by segment length.

        Raises:
            ValueError: if ``eval_iter`` yields no segment at all.
        """
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_len, total_loss = 0, 0.
        n_segments = 0
        start_time = time.time()
        with torch.no_grad():
            # Memory returned by one call is passed on to the next call.
            mems = tuple()
            for data, target, seq_len in eval_iter:
                loss, mems = model(data, target, *mems)
                loss = loss.mean()
                total_loss += seq_len * loss.item()
                total_len += seq_len
                n_segments += 1
        total_time = time.time() - start_time
        if n_segments == 0:
            # Without this guard the timing log and the final division fail
            # with an unrelated NameError / ZeroDivisionError.
            raise ValueError('evaluation iterator yielded no segments')
        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / n_segments))
        return total_loss / total_len

    # Run on the requested split(s); a split that is not evaluated stays None.
    if args.split == 'all':
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
    elif args.split == 'valid':
        valid_loss = evaluate(va_iter)
        test_loss = None
    elif args.split == 'test':
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
        """Format the loss and perplexity (``exp(loss)``) of one split."""
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
        return log_str

    log_str = ''
    if valid_loss is not None:
        log_str += format_log(valid_loss, 'valid')
    if test_loss is not None:
        log_str += format_log(test_loss, 'test')

    logger.info('=' * 100)
    logger.info(log_str)
    logger.info('=' * 100)
# Run the evaluation only when this file is executed as a script.
if __name__ == '__main__':
    main()
\ No newline at end of file
examples/train_transfo_xl.py
deleted
100644 → 0
View file @
eb8fda51
This diff is collapsed.
Click to expand it.
examples/transfo_xl_eval.py
deleted
100644 → 0
View file @
eb8fda51
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Transformer XL model evaluation script.
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
argparse
import
logging
import
time
import
math
import
torch
from
pytorch_pretrained_bert
import
TransfoXLModel
,
TransfoXLCorpus
# Timestamped, INFO-level logging for the whole script.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

# Command-line options of the evaluation script.
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                    help='pretrained model name')
parser.add_argument('--split', type=str, default='test',
                    choices=['all', 'valid', 'test'],
                    help='which split to evaluate')
parser.add_argument('--batch_size', type=int, default=10,
                    help='batch size')
parser.add_argument('--tgt_len', type=int, default=128,
                    help='number of tokens to predict')
parser.add_argument('--ext_len', type=int, default=0,
                    help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=1600,
                    help='length of the retained previous heads')
parser.add_argument('--clamp_len', type=int, default=1000,
                    help='max positional embedding index')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--work_dir', type=str, required=True,
                    help='path to the work_dir')
parser.add_argument('--no_log', action='store_true',
                    help='do not log the eval result')
parser.add_argument('--same_length', action='store_true',
                    help='set same length attention with masking')
args = parser.parse_args()
# NOTE(review): this check is skipped when Python runs with -O.
assert args.ext_len >= 0, 'extended context length must be non-negative'

# Use the GPU only when --cuda was passed.
device = torch.device("cuda" if args.cuda else "cpu")

# Load a pre-processed dataset
# You can also build the corpus yourself using TransfoXLCorpus methods
# The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
# and tokenizing the dataset
# The pre-processed corpus is a conversion (made with the conversion script)
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
# Vocabulary size; not used by the rest of the statements shown here.
ntokens = len(corpus.vocab)

# Iterators over the validation and test splits.
va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
    device=device, ext_len=args.ext_len)

# Load a pre-trained model
model = TransfoXLModel.from_pretrained(args.model_name)
model = model.to(device)

logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
       args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

# Apply the length settings from the command line to the model.
model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
if args.clamp_len > 0:
    model.clamp_len = args.clamp_len
if args.same_length:
    model.same_length = True
###############################################################################
# Evaluation code
###############################################################################
def evaluate(eval_iter):
    """Return the mean loss over ``eval_iter``, weighted by segment length.

    Uses the module-level ``model`` in eval mode with gradients disabled and
    logs the total time and the time per segment.

    Args:
        eval_iter: iterable yielding ``(data, target, seq_len)`` triples.

    Raises:
        ValueError: if ``eval_iter`` yields no segment at all.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_len, total_loss = 0, 0.
    n_segments = 0
    start_time = time.time()
    with torch.no_grad():
        # Memory returned by one call is passed on to the next call.
        mems = tuple()
        for data, target, seq_len in eval_iter:
            loss, mems = model(data, target, *mems)
            loss = loss.mean()
            total_loss += seq_len * loss.item()
            total_len += seq_len
            n_segments += 1
    total_time = time.time() - start_time
    if n_segments == 0:
        # Without this guard the timing log and the final division fail with
        # an unrelated NameError / ZeroDivisionError.
        raise ValueError('evaluation iterator yielded no segments')
    logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
        total_time, 1000 * total_time / n_segments))
    return total_loss / total_len
# Run on the split(s) selected by --split.
# A split that is not evaluated has its loss set to None so that the
# reporting code can skip it.
if args.split == 'all':
    test_loss = evaluate(te_iter)
    valid_loss = evaluate(va_iter)
elif args.split == 'valid':
    valid_loss = evaluate(va_iter)
    test_loss = None
elif args.split == 'test':
    test_loss = evaluate(te_iter)
    valid_loss = None
def format_log(loss, split):
    """Format the loss and perplexity (``exp(loss)``) of ``split`` for logging."""
    perplexity = math.exp(loss)
    return '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(split, loss, perplexity)
# Build one summary line from the losses that were actually computed
# (a loss of None means that split was not evaluated).
log_str = ''
if valid_loss is not None:
    log_str += format_log(valid_loss, 'valid')
if test_loss is not None:
    log_str += format_log(test_loss, 'test')

# Frame the summary between two separator lines.
logger.info('=' * 100)
logger.info(log_str)
logger.info('=' * 100)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment