chenpangpang / transformers — Commit 63c45056

Authored Dec 06, 2018 by Grégory Châtel
Parent: fc5a38ac

    Finishing the code for the Swag task.

Showing 1 changed file with 342 additions and 18 deletions:
examples/run_swag.py (+342 −18)
...
@@ -17,8 +17,20 @@
import pandas as pd
import logging
import os
import argparse
import random
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForMultipleChoice
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
...
@@ -86,6 +98,7 @@ class InputFeatures(object):
        ]
        self.label = label


def read_swag_examples(input_file, is_training):
    input_df = pd.read_csv(input_file)
...
@@ -110,7 +123,6 @@ def read_swag_examples(input_file, is_training):
    return examples


def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 is_training):
    """Loads a data file into a list of `InputBatch`s."""
...
@@ -189,7 +201,6 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
...
@@ -206,21 +217,334 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        else:
            tokens_b.pop()


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def select_field(features, field):
    return [
        [
            choice[field]
            for choice in feature.choices_features
        ]
        for feature in features
    ]
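For orientation, a self-contained sketch of how these two new helpers behave on toy values (hypothetical data, not from the commit): `accuracy` counts correct argmax predictions over a batch of logits, and `select_field` transposes a list of features into per-example lists of per-choice values.

    import numpy as np

    # accuracy: number of rows whose argmax matches the label.
    logits = np.array([[0.1, 0.9], [0.8, 0.2]])
    labels = np.array([1, 1])
    assert np.sum(np.argmax(logits, axis=1) == labels) == 1

    # select_field: features -> [num_examples][num_choices] lists of one field.
    class Feature:
        def __init__(self, choices_features):
            self.choices_features = choices_features

    feats = [Feature([{"input_ids": [1, 2]}, {"input_ids": [3, 4]}])]
    ids = [[c["input_ids"] for c in f.choices_features] for f in feats]
    assert ids == [[[1, 2], [3, 4]]]  # shape: [1 example][2 choices][seq]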
def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the parameters optimized on CPU/RAM back to the model on GPU
    """
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        param_model.data.copy_(param_opti.data)


def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
    """ Utility function for optimize_on_cpu and 16-bits training.
        Copy the gradient of the GPU parameters to the CPU/RAM copy of the model
    """
    is_nan = False
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
        if name_opti != name_model:
            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
            raise ValueError
        if param_model.grad is not None:
            if test_nan and torch.isnan(param_model.grad).sum() > 0:
                is_nan = True
            if param_opti.grad is None:
                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
            param_opti.grad.data.copy_(param_model.grad.data)
        else:
            param_opti.grad = None
    return is_nan
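Together these helpers implement the optimize-on-CPU pattern used in `main` below: gradients are copied from the GPU model to a CPU-resident float32 copy of the parameters, the optimizer steps on that copy, and the updated weights are copied back. A minimal self-contained sketch of the round trip (toy tensors and a stand-in update, not the commit's code):

    import torch

    # Model-side parameter with a freshly computed gradient.
    model_param = torch.nn.Parameter(torch.ones(3))
    model_param.grad = torch.tensor([0.1, 0.2, 0.3])

    # CPU/float32 shadow copy that the optimizer actually updates.
    shadow = model_param.clone().detach().float().requires_grad_()

    shadow.grad = model_param.grad.clone()          # set_optimizer_params_grad
    with torch.no_grad():
        shadow -= 0.5 * shadow.grad                 # stand-in for optimizer.step()
    model_param.data.copy_(shadow.data)             # copy_optimizer_params_to_model

    assert torch.allclose(model_param.data, torch.tensor([0.95, 0.90, 0.85]))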
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.\n"
                             "Sequences longer than this will be truncated, and sequences shorter\n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()
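Only the three required flags plus the training/eval switches are needed for a basic run; the rest default sensibly. A typical shell invocation might therefore look like the following (directory names are placeholders, not from the commit):

    python run_swag.py --data_dir swag_data/ --bert_model bert-base-uncased \
        --do_train --do_eval --do_lower_case --output_dir swag_out/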
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # task_name = args.task_name.lower()

    # if task_name not in processors:
    #     raise ValueError("Task not found: %s" % (task_name))

    # processor = processors[task_name]()
    # label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
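Since `args.train_batch_size` was already divided by `gradient_accumulation_steps` above, `len(train_examples) / args.train_batch_size` counts micro-batches per epoch, and the extra division converts micro-batches into optimizer steps. Worked through with hypothetical numbers:

    # Say 10,000 training examples, --train_batch_size 32,
    # --gradient_accumulation_steps 2, --num_train_epochs 3.0.
    # After the earlier division, args.train_batch_size == 16, so:
    num_train_steps = int(10000 / 16 / 2 * 3.0)   # == 937 optimizer steps
    assert num_train_steps == 937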
    # Prepare model
    model = BertForMultipleChoice.from_pretrained(args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
        num_choices=4)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
        ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
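The `no_decay` grouping applies weight decay only to parameters whose names contain none of the listed substrings; biases and the LayerNorm `gamma`/`beta` weights are exempt. A self-contained sketch of the filter on hypothetical parameter names:

    no_decay = ['bias', 'gamma', 'beta']
    names = ['encoder.layer.0.attention.self.query.weight',
             'encoder.layer.0.attention.self.query.bias',
             'encoder.layer.0.output.LayerNorm.gamma']
    decayed = [n for n in names if not any(nd in n for nd in no_decay)]
    assert decayed == ['encoder.layer.0.attention.self.query.weight']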
    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
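The accumulation logic above divides each micro-batch loss by `gradient_accumulation_steps` and only steps the optimizer every N-th batch, so the accumulated gradients average out to what one large batch would produce. A minimal self-contained sketch of the pattern (toy model and hypothetical sizes, not the commit's code):

    import torch

    model = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    accumulation_steps = 2

    data = [torch.randn(8, 4) for _ in range(4)]  # 4 micro-batches of 8
    for step, x in enumerate(data):
        loss = model(x).pow(2).mean() / accumulation_steps
        loss.backward()                      # gradients accumulate in .grad
        if (step + 1) % accumulation_steps == 0:
            opt.step()                       # effective batch size: 16
            model.zero_grad()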
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training=True)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': tr_loss / nb_tr_steps}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
    main()

The debug `__main__` block that this commit removes (the old side of the diff, shown here for reference):

if __name__ == "__main__":
    is_training = True
    max_seq_length = 80

    examples = read_swag_examples('data/train.csv', is_training)

    print(len(examples))

    for example in examples[:5]:
        print("###########################")
        print(example)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    features = convert_examples_to_features(examples[:500], tokenizer, max_seq_length, is_training)

    for i in range(10):
        choice_feature_list = features[i].choices_features
        for choice_idx, choice_feature in enumerate(choice_feature_list):
            print(f'choice_idx: {choice_idx}')
            print(f'input_ids: {" ".join(map(str, choice_feature["input_ids"]))}')
            print(f'input_mask: {" ".join(map(str, choice_feature["input_mask"]))}')
            print(f'segment_ids: {" ".join(map(str, choice_feature["segment_ids"]))}')