Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
6bc082da
"...internvl3_pytorch.git" did not exist on "26e5928087c6b5a0ced4ad0be32ca1cf22b8a0e6"
Commit
6bc082da
authored
Feb 08, 2019
by
thomwolf
Browse files
updating examples
parent
eb8fda51
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
371 additions
and
734 deletions
+371
-734
examples/run_openai_gpt.py
examples/run_openai_gpt.py
+229
-0
examples/run_transfo_xl.py
examples/run_transfo_xl.py
+142
-0
examples/train_transfo_xl.py
examples/train_transfo_xl.py
+0
-595
examples/transfo_xl_eval.py
examples/transfo_xl_eval.py
+0
-139
No files found.
examples/
trai
n_openai_gpt.py
→
examples/
ru
n_openai_gpt.py
View file @
6bc082da
...
@@ -24,115 +24,20 @@ import os
...
@@ -24,115 +24,20 @@ import os
import
csv
import
csv
import
random
import
random
import
logging
import
logging
from
tqdm
import
tqdm
from
tqdm
import
tqdm
,
trange
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
from
torch.utils.data
import
(
DataLoader
,
RandomSampler
,
SequentialSampler
,
from
torch.utils.data
import
(
DataLoader
,
RandomSampler
,
SequentialSampler
,
TensorDataset
)
TensorDataset
)
from
sklearn.metrics
import
accuracy_score
from
sklearn.utils
import
shuffle
from
pytorch_pretrained_bert
import
OpenAIGPTDoubleHeadsModel
,
OpenAIGPTTokenizer
,
OpenAIAdam
from
pytorch_pretrained_bert
import
OpenAIGPTDoubleHeadsModel
,
OpenAIGPTTokenizer
,
OpenAIAdam
# from analysis import rocstories as rocstories_analysis
# from datasets import rocstories
# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
# from opt import OpenAIAdam
# from text_utils import TextEncoder
# from utils import (encode_dataset, iter_data,
# ResultLogger, make_path)
# from loss import MultipleChoiceLossCompute
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
level
=
logging
.
INFO
)
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
def iter_apply(Xs, Ms, Ys):
    """Run the double-head model over (Xs, Ms, Ys) without gradients.

    Batches come from ``iter_data`` with ``n_batch_train`` items per batch and
    no truncation. For every batch the classification logits are multiplied
    in place by the batch length before being handed to ``compute_loss_fct``,
    and the returned losses are multiplied by the batch length as well.

    Returns:
        A ``(logits, cost)`` pair: the per-batch logits concatenated along
        axis 0 as a NumPy array, and the summed losses as a Python number.
    """
    collected_logits = []
    total_cost = 0
    with torch.no_grad():
        dh_model.eval()
        batches = iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True)
        for x_batch, m_batch, y_batch in batches:
            batch_len = len(x_batch)
            x_tensor = torch.tensor(x_batch, dtype=torch.long).to(device)
            y_tensor = torch.tensor(y_batch, dtype=torch.long).to(device)
            m_tensor = torch.tensor(m_batch).to(device)
            # Only the second model output (classification logits) is used here.
            _, clf_logits = dh_model(x_tensor)
            clf_logits *= batch_len
            clf_losses = compute_loss_fct(x_tensor, y_tensor, m_tensor, clf_logits,
                                          only_return_losses=True)
            clf_losses *= batch_len
            collected_logits.append(clf_logits.to("cpu").numpy())
            total_cost += clf_losses.sum().item()
        stacked_logits = np.concatenate(collected_logits, 0)
    return stacked_logits, total_cost
def iter_predict(Xs, Ms):
    """Return the classification logits of the model for every item in Xs.

    Batches come from ``iter_data`` with ``n_batch_train`` items per batch and
    no truncation; the model runs in eval mode with gradients disabled.

    Returns:
        The per-batch classification logits concatenated along axis 0 as a
        NumPy array.
    """
    collected_logits = []
    with torch.no_grad():
        dh_model.eval()
        batches = iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True)
        for x_batch, m_batch in batches:
            # The batch length and the mask tensor are computed but not used below.
            batch_len = len(x_batch)
            x_tensor = torch.tensor(x_batch, dtype=torch.long).to(device)
            m_tensor = torch.tensor(m_batch).to(device)
            _, clf_logits = dh_model(x_tensor)
            collected_logits.append(clf_logits.to("cpu").numpy())
    stacked_logits = np.concatenate(collected_logits, 0)
    return stacked_logits
def log(save_dir, desc):
    """Evaluate the model, report cost/accuracy and checkpoint the best weights.

    The first ``n_valid`` training examples and the validation set are run
    through ``iter_apply``; costs are averaged, accuracies are expressed in
    percent, and the figures are both logged and printed. When ``submit`` is
    truthy and the validation accuracy beats ``best_score``, the model state
    dict is saved under ``<save_dir>/<desc>/best_params``.

    Args:
        save_dir: root directory for checkpoints.
        desc: sub-directory name (joined below ``save_dir``).
    """
    global best_score
    print("Logging")
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost / len(trY[:n_valid])
    # NOTE(review): divides by n_valid rather than len(vaY) - confirm both are equal.
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
    # The module-level `logger` comes from logging.getLogger(); Logger.log()
    # requires a level and a message, so a keyword-only call raises TypeError.
    # Emit the same figures through logger.info with lazy %-formatting instead.
    logger.info("n_epochs=%d n_updates=%d tr_cost=%.3f va_cost=%.3f tr_acc=%.2f va_acc=%.2f",
                n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            path = os.path.join(save_dir, desc, 'best_params')
            torch.save(dh_model.state_dict(), make_path(path))
def predict(dataset, submission_dir):
    """Write test-set predictions for `dataset` as a tab-separated file.

    The output file name, the logits-to-prediction function and the optional
    label decoder are looked up by `dataset` in ``filenames``, ``pred_fns``
    and ``label_decoders``. The file starts with an ``index``/``prediction``
    header row followed by one row per prediction.

    Args:
        dataset: key used for the three lookups above.
        submission_dir: directory in which the file is created (parent
            directories are created when missing).
    """
    out_name = filenames[dataset]
    to_predictions = pred_fns[dataset]
    decoder = label_decoders[dataset]
    predictions = to_predictions(iter_predict(teX, teM))
    if decoder is not None:
        predictions = [decoder[item] for item in predictions]
    out_path = os.path.join(submission_dir, out_name)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    row_template = '{}\t{}\n'
    with open(out_path, 'w') as f:
        f.write(row_template.format('index', 'prediction'))
        for row_index, prediction in enumerate(predictions):
            f.write(row_template.format(row_index, prediction))
def run_epoch():
    """Run one pass over the shuffled training data.

    For every batch the model is put in train mode, both model outputs are
    passed to ``compute_loss_fct`` and the global ``n_updates`` counter is
    incremented. During the first epoch (``n_epochs == 0``) ``log`` is called
    when the counter reaches one of a fixed set of update counts.
    """
    global n_updates
    log_at_updates = [1000, 2000, 4000, 8000, 16000, 32000]
    shuffled = shuffle(trX, trM, trYt, random_state=np.random)
    batches = iter_data(*shuffled, n_batch=n_batch_train, truncate=True, verbose=True)
    for x_batch, m_batch, y_batch in batches:
        # Re-enter train mode on every batch: log() runs the model in eval mode.
        dh_model.train()
        x_tensor = torch.tensor(x_batch, dtype=torch.long).to(device)
        y_tensor = torch.tensor(y_batch, dtype=torch.long).to(device)
        m_tensor = torch.tensor(m_batch).to(device)
        lm_logits, clf_logits = dh_model(x_tensor)
        compute_loss_fct(x_tensor, y_tensor, m_tensor, clf_logits, lm_logits)
        n_updates += 1
        if n_updates in log_at_updates and n_epochs == 0:
            log(save_dir, desc)
def accuracy(out, labels):
    """Return the number of rows of `out` whose arg-max (axis 1) equals `labels`."""
    predicted = np.argmax(out, axis=1)
    hits = predicted == labels
    return np.sum(hits)
...
@@ -147,35 +52,43 @@ def load_rocstories_dataset(dataset_path):
...
@@ -147,35 +52,43 @@ def load_rocstories_dataset(dataset_path):
output
.
append
((
' '
.
join
(
line
[
1
:
5
]),
line
[
5
],
line
[
6
],
int
(
line
[
-
1
])
-
1
))
output
.
append
((
' '
.
join
(
line
[
1
:
5
]),
line
[
5
],
line
[
6
],
int
(
line
[
-
1
])
-
1
))
return
output
return
output
def pre_process_datasets(encoded_datasets, max_len, start_token, delimiter_token, clf_token):
    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

        Every example yields two alternatives, one per continuation:
        input_ids[example, alternative, :] =
            [start_token] + story[:max_len] + [delimiter_token] + cont[:max_len] + [clf_token]
        padded with zeros up to 2 * max_len + 3 positions.

        Args:
            encoded_datasets: iterable of datasets; each dataset is a sequence of
                (story, cont1, cont2, label) where the first three are lists of token ids.
            max_len: maximum number of tokens kept from the story and from each continuation.
            start_token, delimiter_token, clf_token: ids of the special tokens.

        Returns:
            A list with one tuple of tensors per dataset:
            (input_ids, mc_token_mask, lm_labels, mc_labels).
            mc_token_mask is 1 at the position of clf_token, lm_labels holds the
            input shifted left by one position with -1 everywhere else.
    """
    # Longest sequence that can be built: start + story + delimiter + continuation + clf.
    # Sizing the buffers with max_len alone overflowed as soon as a sequence was longer.
    input_len = 2 * max_len + 3
    tensor_datasets = []
    for dataset in encoded_datasets:
        n_batch = len(dataset)
        # Token ids and labels are stored as int64.
        # NOTE(review): presumably required by the embedding / cross-entropy
        # layers of the model - confirm against the model implementation.
        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
        mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int32)
        lm_labels = np.full((n_batch, 2, input_len), -1, dtype=np.int64)
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
        for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
            for alternative, cont in enumerate((cont1, cont2)):
                sequence = [start_token] + story[:max_len] + [delimiter_token] + cont[:max_len] + [clf_token]
                input_ids[i, alternative, :len(sequence)] = sequence
                mc_token_mask[i, alternative, len(sequence) - 1] = 1
                lm_labels[i, alternative, :len(sequence) - 1] = sequence[1:]
            mc_labels[i] = mc_label
        # A tuple literal: tuple(a, b, c, d) raises TypeError (tuple() takes one iterable).
        all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets
if
__name__
==
'__
main
__'
:
def
main
()
:
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--model_name'
,
type
=
str
,
default
=
'openai-gpt'
,
parser
.
add_argument
(
'--model_name'
,
type
=
str
,
default
=
'openai-gpt'
,
help
=
'pretrained model name'
)
help
=
'pretrained model name'
)
parser
.
add_argument
(
'--data_dir'
,
type
=
str
,
default
=
'data/'
)
parser
.
add_argument
(
'--train_dataset'
,
type
=
str
,
default
=
'cloze_test_val__spring2016 - cloze_test_ALL_val.tsv'
)
parser
.
add_argument
(
'--eval_dataset'
,
type
=
str
,
default
=
'test_spring2016.tsv'
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
42
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
42
)
parser
.
add_argument
(
'--num_train_epochs'
,
type
=
int
,
default
=
3
)
parser
.
add_argument
(
'--num_train_epochs'
,
type
=
int
,
default
=
3
)
parser
.
add_argument
(
'--train_batch_size'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--train_batch_size'
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
'--eval_batch_size'
,
type
=
int
,
default
=
16
)
parser
.
add_argument
(
'--max_grad_norm'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--max_grad_norm'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--learning_rate'
,
type
=
float
,
default
=
6.25e-5
)
parser
.
add_argument
(
'--learning_rate'
,
type
=
float
,
default
=
6.25e-5
)
parser
.
add_argument
(
'--warmup_proportion'
,
type
=
float
,
default
=
0.002
)
parser
.
add_argument
(
'--warmup_proportion'
,
type
=
float
,
default
=
0.002
)
...
@@ -194,7 +107,7 @@ if __name__ == '__main__':
...
@@ -194,7 +107,7 @@ if __name__ == '__main__':
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
n_gpu
=
torch
.
cuda
.
device_count
()
n_gpu
=
torch
.
cuda
.
device_count
()
logger
.
info
(
"device
"
,
device
,
"
n_gpu
"
,
n_gpu
)
logger
.
info
(
"device
: {}
, n_gpu
{}"
.
format
(
device
,
n_gpu
)
)
# Load tokenizer and model
# Load tokenizer and model
# These loading functions also add new tokens and embeddings called `special tokens`
# These loading functions also add new tokens and embeddings called `special tokens`
...
@@ -204,22 +117,34 @@ if __name__ == '__main__':
...
@@ -204,22 +117,34 @@ if __name__ == '__main__':
special_tokens_ids
=
list
(
tokenizer
.
convert_tokens_to_ids
(
token
)
for
token
in
special_tokens
)
special_tokens_ids
=
list
(
tokenizer
.
convert_tokens_to_ids
(
token
)
for
token
in
special_tokens
)
model
=
OpenAIGPTDoubleHeadsModel
.
from_pretrained
(
args
.
model_name
,
num_special_tokens
=
len
(
special_tokens
))
model
=
OpenAIGPTDoubleHeadsModel
.
from_pretrained
(
args
.
model_name
,
num_special_tokens
=
len
(
special_tokens
))
# Load
the dataset and prepare the inpu
ts
# Load
and encode the datase
ts
logger
.
info
(
"Encoding dataset..."
)
logger
.
info
(
"Encoding dataset..."
)
dataset
=
load_rocstories_dataset
(
args
.
dataset_path
)
train_dataset
=
load_rocstories_dataset
(
args
.
train_dataset
)
tokenized_dataset
=
list
(
list
(
tokenizer
.
tokenize
(
x
)
for
x
in
instance
)
for
instance
in
dataset
)
eval_datset
=
load_rocstories_dataset
(
args
.
eval_datset
)
encoded_dataset
=
list
(
list
(
tokenizer
.
convert_tokens_to_ids
(
x
)
for
x
in
instance
)
for
instance
in
tokenized_dataset
)
datasets
=
(
train_dataset
,
eval_datset
)
tokenized_datasets
=
tuple
(
list
(
list
(
tokenizer
.
tokenize
(
x
)
for
x
in
instance
)
max_input_length
=
max
(
len
(
story
)
+
max
(
len
(
cont1
),
len
(
cont2
))
+
3
for
story
,
cont1
,
cont2
,
_
in
encoded_dataset
)
for
instance
in
dataset
)
for
dataset
in
datasets
)
encoded_datasets
=
tuple
(
list
(
list
(
tokenizer
.
convert_tokens_to_ids
(
x
)
for
x
in
instance
)
for
instance
in
dataset
)
for
dataset
in
tokenized_datasets
)
# Compute the max input length for the Transformer
max_input_length
=
max
(
len
(
story
)
+
max
(
len
(
cont1
),
len
(
cont2
))
+
3
\
for
dataset
in
encoded_datasets
for
story
,
cont1
,
cont2
,
_
in
dataset
)
max_input_length
=
min
(
max_input_length
,
model
.
config
.
n_positions
)
# Max size of input for the pre-trained model
max_input_length
=
min
(
max_input_length
,
model
.
config
.
n_positions
)
# Max size of input for the pre-trained model
max_sub_part_length
=
max_input_length
//
2
-
2
max_sub_part_length
=
max_input_length
//
2
-
2
# Prepare dataloader
# Prepare inputs tensors and dataloaders
dataset_tensors
=
pre_process_dataset
(
encoded_dataset
,
max_sub_part_length
,
*
special_tokens_ids
)
tensor_datasets
=
pre_process_datasets
(
encoded_datasets
,
max_sub_part_length
,
*
special_tokens_ids
)
train_data
=
TensorDataset
(
*
dataset_tensors
)
train_tensor_dataset
,
eval_tensor_dataset
=
tensor_datasets
[
0
],
tensor_datasets
[
1
]
train_data
=
TensorDataset
(
*
train_tensor_dataset
)
train_sampler
=
RandomSampler
(
train_data
)
train_sampler
=
RandomSampler
(
train_data
)
train_dataloader
=
DataLoader
(
train_data
,
sampler
=
train_sampler
,
batch_size
=
args
.
train_batch_size
)
train_dataloader
=
DataLoader
(
train_data
,
sampler
=
train_sampler
,
batch_size
=
args
.
train_batch_size
)
eval_data
=
TensorDataset
(
*
eval_tensor_dataset
)
eval_sampler
=
SequentialSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
# Prepare optimizer
# Prepare optimizer
param_optimizer
=
list
(
model
.
named_parameters
())
param_optimizer
=
list
(
model
.
named_parameters
())
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
...
@@ -232,11 +157,10 @@ if __name__ == '__main__':
...
@@ -232,11 +157,10 @@ if __name__ == '__main__':
lr
=
args
.
learning_rate
,
lr
=
args
.
learning_rate
,
warmup
=
args
.
warmup_proportion
,
warmup
=
args
.
warmup_proportion
,
max_grad_norm
=
args
.
max_grad_norm
,
max_grad_norm
=
args
.
max_grad_norm
,
weight_decay
=
ar
s
g
.
weight_decay
,
weight_decay
=
arg
s
.
weight_decay
,
t_total
=
num_train_optimization_steps
)
t_total
=
num_train_optimization_steps
)
if
args
.
do_train
:
if
args
.
do_train
:
global_step
=
0
nb_tr_steps
=
0
nb_tr_steps
=
0
tr_loss
=
0
tr_loss
=
0
model
.
train
()
model
.
train
()
...
@@ -249,6 +173,7 @@ if __name__ == '__main__':
...
@@ -249,6 +173,7 @@ if __name__ == '__main__':
losses
=
model
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
)
losses
=
model
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
)
loss
=
args
.
lm_coef
*
losses
[
0
]
+
losses
[
1
]
loss
=
args
.
lm_coef
*
losses
[
0
]
+
losses
[
1
]
loss
.
backward
()
loss
.
backward
()
optimizer
.
step
()
tr_loss
+=
loss
.
item
()
tr_loss
+=
loss
.
item
()
nb_tr_examples
+=
input_ids
.
size
(
0
)
nb_tr_examples
+=
input_ids
.
size
(
0
)
nb_tr_steps
+=
1
nb_tr_steps
+=
1
...
@@ -261,44 +186,26 @@ if __name__ == '__main__':
...
@@ -261,44 +186,26 @@ if __name__ == '__main__':
# Load a trained model that you have fine-tuned
# Load a trained model that you have fine-tuned
model_state_dict
=
torch
.
load
(
output_model_file
)
model_state_dict
=
torch
.
load
(
output_model_file
)
model
=
OpenAIGPTDoubleHeadsModel
(
args
.
mode
,
state_dict
=
model_state_dict
,
num_labels
=
num_labels
)
model
=
OpenAIGPTDoubleHeadsModel
.
from_pretrained
(
args
.
model_name
,
state_dict
=
model_state_dict
,
num_special_tokens
=
len
(
special_tokens
))
model
.
to
(
device
)
model
.
to
(
device
)
if
args
.
do_eval
:
if
args
.
do_eval
:
eval_examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
eval_features
=
convert_examples_to_features
(
eval_examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
)
logger
.
info
(
"***** Running evaluation *****"
)
logger
.
info
(
" Num examples = %d"
,
len
(
eval_examples
))
logger
.
info
(
" Batch size = %d"
,
args
.
eval_batch_size
)
all_input_ids
=
torch
.
tensor
([
f
.
input_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_input_mask
=
torch
.
tensor
([
f
.
input_mask
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_segment_ids
=
torch
.
tensor
([
f
.
segment_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_label_ids
=
torch
.
tensor
([
f
.
label_id
for
f
in
eval_features
],
dtype
=
torch
.
long
)
eval_data
=
TensorDataset
(
all_input_ids
,
all_input_mask
,
all_segment_ids
,
all_label_ids
)
# Run prediction for full data
eval_sampler
=
SequentialSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
model
.
eval
()
model
.
eval
()
eval_loss
,
eval_accuracy
=
0
,
0
eval_loss
,
eval_accuracy
=
0
,
0
nb_eval_steps
,
nb_eval_examples
=
0
,
0
nb_eval_steps
,
nb_eval_examples
=
0
,
0
for
batch
in
tqdm
(
eval_dataloader
,
desc
=
"Evaluating"
):
for
input_ids
,
input_mask
,
segment_ids
,
label_ids
in
tqdm
(
eval_dataloader
,
desc
=
"Evaluating"
):
batch
=
tuple
(
t
.
to
(
device
)
for
t
in
batch
)
input_ids
=
input_ids
.
to
(
device
)
input_ids
,
mc_token_mask
,
lm_labels
,
mc_labels
=
batch
input_mask
=
input_mask
.
to
(
device
)
segment_ids
=
segment_ids
.
to
(
device
)
label_ids
=
label_ids
.
to
(
device
)
with
torch
.
no_grad
():
with
torch
.
no_grad
():
tmp_eval
_loss
=
model
(
input_ids
,
segment_ids
,
input_mask
,
label
_id
s
)
_
,
mc
_loss
=
model
(
input_ids
,
mc_token_mask
,
lm_labels
,
mc_
labels
)
logits
=
model
(
input_ids
,
segment_ids
,
input
_mask
)
_
,
mc_
logits
=
model
(
input_ids
,
mc_token
_mask
)
logits
=
logits
.
detach
().
cpu
().
numpy
()
mc_
logits
=
mc_
logits
.
detach
().
cpu
().
numpy
()
label
_id
s
=
label
_id
s
.
to
(
'cpu'
).
numpy
()
mc_
labels
=
mc_
labels
.
to
(
'cpu'
).
numpy
()
tmp_eval_accuracy
=
accuracy
(
logits
,
label
_id
s
)
tmp_eval_accuracy
=
accuracy
(
mc_
logits
,
mc_
labels
)
eval_loss
+=
tmp_eval
_loss
.
mean
().
item
()
eval_loss
+=
mc
_loss
.
mean
().
item
()
eval_accuracy
+=
tmp_eval_accuracy
eval_accuracy
+=
tmp_eval_accuracy
nb_eval_examples
+=
input_ids
.
size
(
0
)
nb_eval_examples
+=
input_ids
.
size
(
0
)
...
@@ -306,11 +213,10 @@ if __name__ == '__main__':
...
@@ -306,11 +213,10 @@ if __name__ == '__main__':
eval_loss
=
eval_loss
/
nb_eval_steps
eval_loss
=
eval_loss
/
nb_eval_steps
eval_accuracy
=
eval_accuracy
/
nb_eval_examples
eval_accuracy
=
eval_accuracy
/
nb_eval_examples
loss
=
tr_loss
/
nb_tr_steps
if
args
.
do_train
else
None
train_
loss
=
tr_loss
/
nb_tr_steps
if
args
.
do_train
else
None
result
=
{
'eval_loss'
:
eval_loss
,
result
=
{
'eval_loss'
:
eval_loss
,
'eval_accuracy'
:
eval_accuracy
,
'eval_accuracy'
:
eval_accuracy
,
'global_step'
:
global_step
,
'train_loss'
:
train_loss
}
'loss'
:
loss
}
output_eval_file
=
os
.
path
.
join
(
args
.
output_dir
,
"eval_results.txt"
)
output_eval_file
=
os
.
path
.
join
(
args
.
output_dir
,
"eval_results.txt"
)
with
open
(
output_eval_file
,
"w"
)
as
writer
:
with
open
(
output_eval_file
,
"w"
)
as
writer
:
...
@@ -319,26 +225,5 @@ if __name__ == '__main__':
...
@@ -319,26 +225,5 @@ if __name__ == '__main__':
logger
.
info
(
" %s = %s"
,
key
,
str
(
result
[
key
]))
logger
.
info
(
" %s = %s"
,
key
,
str
(
result
[
key
]))
writer
.
write
(
"%s = %s
\n
"
%
(
key
,
str
(
result
[
key
])))
writer
.
write
(
"%s = %s
\n
"
%
(
key
,
str
(
result
[
key
])))
if
__name__
==
"
__main__
"
:
if
__name__
==
'
__main__
'
:
main
()
main
()
n_updates
=
0
n_epochs
=
0
if
dataset
!=
'stsb'
:
trYt
=
trY
if
submit
:
path
=
os
.
path
.
join
(
save_dir
,
desc
,
'best_params'
)
torch
.
save
(
dh_model
.
state_dict
(),
make_path
(
path
))
best_score
=
0
for
i
in
range
(
args
.
n_iter
):
print
(
"running epoch"
,
i
)
run_epoch
()
n_epochs
+=
1
log
(
save_dir
,
desc
)
if
submit
:
path
=
os
.
path
.
join
(
save_dir
,
desc
,
'best_params'
)
dh_model
.
load_state_dict
(
torch
.
load
(
path
))
predict
(
dataset
,
args
.
submission_dir
)
if
args
.
analysis
:
rocstories_analysis
(
data_dir
,
os
.
path
.
join
(
args
.
submission_dir
,
'ROCStories.tsv'
),
os
.
path
.
join
(
log_dir
,
'rocstories.jsonl'
))
examples/run_transfo_xl.py
0 → 100644
View file @
6bc082da
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Transformer XL model evaluation script.
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
argparse
import
logging
import
time
import
math
import
torch
from
pytorch_pretrained_bert
import
TransfoXLModel
,
TransfoXLCorpus
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
    """Evaluate a pretrained Transformer-XL model and log loss and perplexity.

    Parses the command-line options, loads the pre-processed corpus and the
    pretrained model named by ``--model_name``, evaluates the split(s)
    selected with ``--split`` ('all', 'valid' or 'test') and logs one summary
    line with loss and perplexity for each evaluated split.
    """
    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                        help='pretrained model name')
    parser.add_argument('--split', type=str, default='test',
                        choices=['all', 'valid', 'test'],
                        help='which split to evaluate')
    parser.add_argument('--batch_size', type=int, default=10,
                        help='batch size')
    parser.add_argument('--tgt_len', type=int, default=128,
                        help='number of tokens to predict')
    parser.add_argument('--ext_len', type=int, default=0,
                        help='length of the extended context')
    parser.add_argument('--mem_len', type=int, default=1600,
                        help='length of the retained previous heads')
    parser.add_argument('--clamp_len', type=int, default=1000,
                        help='max positional embedding index')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    # NOTE: --work_dir and --no_log are accepted but not read anywhere in this function.
    parser.add_argument('--work_dir', type=str, required=True,
                        help='path to the work_dir')
    parser.add_argument('--no_log', action='store_true',
                        help='do not log the eval result')
    parser.add_argument('--same_length', action='store_true',
                        help='set same length attention with masking')
    args = parser.parse_args()
    # Validate with parser.error rather than assert: asserts are stripped under -O.
    if args.ext_len < 0:
        parser.error('extended context length must be non-negative')

    device = torch.device("cuda" if args.cuda else "cpu")

    # Load a pre-processed dataset
    # You can also build the corpus yourself using TransfoXLCorpus methods
    # The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
    # and tokenizing the dataset
    # The pre-processed corpus is a conversion (made with the conversion script)
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)

    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)
    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
                                  device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLModel.from_pretrained(args.model_name)
    model = model.to(device)

    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
        model.clamp_len = args.clamp_len
    if args.same_length:
        model.same_length = True

    ###############################################################################
    # Evaluation code
    ###############################################################################
    def evaluate(eval_iter):
        """Return the loss averaged over all sequence positions yielded by `eval_iter`.

        Raises:
            ValueError: if the iterator yields no segment (or only empty ones),
                since no average can be computed in that case.
        """
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_len, total_loss = 0, 0.
        n_segments = 0
        start_time = time.time()
        with torch.no_grad():
            mems = tuple()
            for data, target, seq_len in eval_iter:
                # The second model output is fed back as extra inputs for the next segment.
                loss, mems = model(data, target, *mems)
                loss = loss.mean()
                total_loss += seq_len * loss.item()
                total_len += seq_len
                n_segments += 1
            total_time = time.time() - start_time
        if n_segments == 0 or total_len == 0:
            raise ValueError('evaluation iterator produced no tokens to evaluate')
        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
            total_time, 1000 * total_time / n_segments))
        return total_loss / total_len

    # Run on the requested split(s); the test split is evaluated first for 'all'.
    test_loss = evaluate(te_iter) if args.split in ('all', 'test') else None
    valid_loss = evaluate(va_iter) if args.split in ('all', 'valid') else None

    def format_log(loss, split):
        """Format the loss and its exponential (perplexity) for one split."""
        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
            split, loss, math.exp(loss))
        return log_str

    log_str = ''
    if valid_loss is not None:
        log_str += format_log(valid_loss, 'valid')
    if test_loss is not None:
        log_str += format_log(test_loss, 'test')

    logger.info('=' * 100)
    logger.info(log_str)
    logger.info('=' * 100)


if __name__ == '__main__':
    main()
\ No newline at end of file
examples/train_transfo_xl.py
deleted
100644 → 0
View file @
eb8fda51
This diff is collapsed.
Click to expand it.
examples/transfo_xl_eval.py
deleted
100644 → 0
View file @
eb8fda51
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Transformer XL model evaluation script.
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
from
__future__
import
absolute_import
,
division
,
print_function
,
unicode_literals
import
argparse
import
logging
import
time
import
math
import
torch
from
pytorch_pretrained_bert
import
TransfoXLModel
,
TransfoXLCorpus
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)
logger = logging.getLogger(__name__)

# Command-line options of the evaluation script.
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument(
    '--model_name', type=str, default='transfo-xl-wt103', help='pretrained model name')
parser.add_argument(
    '--split', type=str, default='test', choices=['all', 'valid', 'test'],
    help='which split to evaluate')
parser.add_argument(
    '--batch_size', type=int, default=10, help='batch size')
parser.add_argument(
    '--tgt_len', type=int, default=128, help='number of tokens to predict')
parser.add_argument(
    '--ext_len', type=int, default=0, help='length of the extended context')
parser.add_argument(
    '--mem_len', type=int, default=1600, help='length of the retained previous heads')
parser.add_argument(
    '--clamp_len', type=int, default=1000, help='max positional embedding index')
parser.add_argument(
    '--cuda', action='store_true', help='use CUDA')
parser.add_argument(
    '--work_dir', type=str, required=True, help='path to the work_dir')
parser.add_argument(
    '--no_log', action='store_true', help='do not log the eval result')
parser.add_argument(
    '--same_length', action='store_true', help='set same length attention with masking')
args = parser.parse_args()
assert args.ext_len >= 0, 'extended context length must be non-negative'

device = torch.device("cuda" if args.cuda else "cpu")

# Load the pre-processed corpus that matches the pretrained model.
# The corpus can also be built by hand with the TransfoXLCorpus methods.
# Pre-processing computes word frequencies (for the Adaptive input and SoftMax)
# and tokenizes the dataset.
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
ntokens = len(corpus.vocab)

# Validation and test iterators share every setting except the split name.
va_iter = corpus.get_iterator(
    'valid', args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator(
    'test', args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)

# Load the pre-trained model and move it to the selected device.
model = TransfoXLModel.from_pretrained(args.model_name)
model = model.to(device)

logger.info(
    'Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))

model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
# A non-positive --clamp_len leaves the model's own clamp_len untouched.
if args.clamp_len > 0:
    model.clamp_len = args.clamp_len
if args.same_length:
    model.same_length = True
###############################################################################
# Evaluation code
###############################################################################
def evaluate(eval_iter):
    """Return the loss averaged over all sequence positions yielded by `eval_iter`.

    Each item of `eval_iter` is unpacked as ``(data, target, seq_len)``. The
    second output of the model is passed back as extra positional inputs on
    the next call. Elapsed time and time per segment are logged.
    """
    # Evaluation mode disables dropout.
    model.eval()
    token_count = 0
    loss_sum = 0.
    started = time.time()
    with torch.no_grad():
        mems = tuple()
        for idx, batch in enumerate(eval_iter):
            data, target, seq_len = batch
            loss, mems = model(data, target, *mems)
            mean_loss = loss.mean()
            loss_sum += seq_len * mean_loss.item()
            token_count += seq_len
        elapsed = time.time() - started
    per_segment_ms = 1000 * elapsed / (idx + 1)
    logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(elapsed, per_segment_ms))
    return loss_sum / token_count
# Evaluate the requested split(s). A split that is not evaluated gets None.
# For 'all' the test split runs before the validation split.
if args.split in ('all', 'valid', 'test'):
    test_loss = evaluate(te_iter) if args.split != 'valid' else None
    valid_loss = evaluate(va_iter) if args.split != 'test' else None
def format_log(loss, split):
    """Return a summary fragment with the loss and its exponential (perplexity) for `split`."""
    perplexity = math.exp(loss)
    template = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '
    return template.format(split, loss, perplexity)
# Build the summary line from the splits that were evaluated (validation first),
# then log it between two separator lines.
log_str = ''.join(
    format_log(split_loss, split_name)
    for split_loss, split_name in ((valid_loss, 'valid'), (test_loss, 'test'))
    if split_loss is not None
)

for message in ('=' * 100, log_str, '=' * 100):
    logger.info(message)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment