Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
63c45056
"tests/models/bart/test_modeling_bart.py" did not exist on "a75c64d80c76c3dc71f735d9197a4a601847e0cd"
Commit
63c45056
authored
Dec 06, 2018
by
Grégory Châtel
Browse files
Finishing the code for the Swag task.
parent
fc5a38ac
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
342 additions
and
18 deletions
+342
-18
examples/run_swag.py
examples/run_swag.py
+342
-18
No files found.
examples/run_swag.py
View file @
63c45056
...
...
@@ -17,8 +17,20 @@
import
pandas
as
pd
import
logging
import
os
import
argparse
import
random
from
tqdm
import
tqdm
,
trange
import
numpy
as
np
import
torch
from
torch.utils.data
import
TensorDataset
,
DataLoader
,
RandomSampler
,
SequentialSampler
from
torch.utils.data.distributed
import
DistributedSampler
from
pytorch_pretrained_bert.tokenization
import
BertTokenizer
from
pytorch_pretrained_bert.modeling
import
BertForMultipleChoice
from
pytorch_pretrained_bert.optimization
import
BertAdam
from
pytorch_pretrained_bert.file_utils
import
PYTORCH_PRETRAINED_BERT_CACHE
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
...
...
@@ -86,6 +98,7 @@ class InputFeatures(object):
]
self
.
label
=
label
def
read_swag_examples
(
input_file
,
is_training
):
input_df
=
pd
.
read_csv
(
input_file
)
...
...
@@ -110,7 +123,6 @@ def read_swag_examples(input_file, is_training):
return
examples
def
convert_examples_to_features
(
examples
,
tokenizer
,
max_seq_length
,
is_training
):
"""Loads a data file into a list of `InputBatch`s."""
...
...
@@ -189,7 +201,6 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
return
features
def
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
max_length
):
"""Truncates a sequence pair in place to the maximum length."""
...
...
@@ -206,21 +217,334 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
else
:
tokens_b
.
pop
()
def accuracy(out, labels):
    """Count the rows of `out` whose highest-scoring column equals the label.

    The result is the number of matches (a count, not a ratio); the caller is
    expected to divide by the number of examples if a fraction is wanted.
    """
    predicted = np.argmax(out, axis=1)
    hits = predicted == labels
    return np.sum(hits)
def select_field(features, field):
    """Collect `field` from every choice of every feature.

    Returns one inner list per feature, holding `choice[field]` for each entry
    of that feature's `choices_features`, in the original order.
    """
    selected = []
    for feature in features:
        per_choice = []
        for choice in feature.choices_features:
            per_choice.append(choice[field])
        selected.append(per_choice)
    return selected
def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
    """Write the optimizer-side parameter values back into the model.

    Helper for the `optimize_on_cpu` and 16-bit training modes. Both arguments
    are iterables of `(name, parameter)` pairs that are walked in lockstep;
    each model parameter's data is overwritten in place with the data of the
    optimizer parameter found at the same position.

    Raises:
        ValueError: if the two names of a pair differ (the mismatch is logged
            first).
    """
    paired = zip(named_params_optimizer, named_params_model)
    for optimizer_entry, model_entry in paired:
        optimizer_name, optimizer_param = optimizer_entry
        model_name, model_param = model_entry
        if optimizer_name == model_name:
            model_param.data.copy_(optimizer_param.data)
            continue
        # The two sequences are out of step: report which names collided.
        logger.error("name_opti != name_model: {} {}".format(optimizer_name, model_name))
        raise ValueError
def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
    """Mirror the model's gradients onto the optimizer's parameter copies.

    Helper for the `optimize_on_cpu` and 16-bit training modes. Both arguments
    are iterables of `(name, parameter)` pairs that are walked in lockstep.
    For every pair, the model parameter's gradient is copied into the gradient
    of the optimizer-side parameter; when the model parameter has no gradient,
    the optimizer-side gradient is reset to None.

    Args:
        named_params_optimizer: `(name, parameter)` pairs held by the optimizer.
        named_params_model: `(name, parameter)` pairs of the model.
        test_nan: when True, also check each model gradient for NaN values.

    Returns:
        True if `test_nan` is set and at least one model gradient contained a
        NaN, otherwise False. Gradients are still copied in that case.

    Raises:
        ValueError: if the two names of a pair differ (the mismatch is logged
            first).
    """
    found_nan = False
    for (opt_name, opt_param), (model_name, model_param) in zip(named_params_optimizer,
                                                               named_params_model):
        if opt_name != model_name:
            logger.error("name_opti != name_model: {} {}".format(opt_name, model_name))
            raise ValueError

        model_grad = model_param.grad
        if model_grad is None:
            # No gradient on the model side: clear the copy's gradient as well.
            opt_param.grad = None
            continue

        if test_nan and torch.isnan(model_grad).sum() > 0:
            found_nan = True

        if opt_param.grad is None:
            # First use: allocate a gradient buffer sized like the optimizer-side parameter.
            buffer = opt_param.data.new().resize_(*opt_param.data.size())
            opt_param.grad = torch.nn.Parameter(buffer)
        opt_param.grad.data.copy_(model_grad.data)
    return found_nan
def main():
    """Command-line entry point: fine-tune and/or evaluate BERT on SWAG.

    Parses the command-line options, sets up the device (single device,
    multi-GPU or distributed), loads the tokenizer and a
    `BertForMultipleChoice` model with four choices, and then:

    * with `--do_train`, trains on `<data_dir>/train.csv`;
    * with `--do_eval`, evaluates on `<data_dir>/val.csv` (on rank 0 only when
      distributed) and writes the metrics to `<output_dir>/eval_results.txt`.

    Raises:
        ValueError: if `--gradient_accumulation_steps` is below 1, if neither
            `--do_train` nor `--do_eval` is given, or if the output directory
            already exists and is not empty.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.\n"
                             "Sequences longer than this will be truncated, and sequences shorter\n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    # Device selection: local_rank == -1 means "not distributed".
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # --train_batch_size is the total per update; each forward/backward pass
    # processes that total divided by the number of accumulation steps.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
        num_choices=4)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        # The optimizer works on float CPU copies of the (half precision) model parameters.
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        # The optimizer works on CPU copies of the model parameters.
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    # num_train_steps stays None when not training; only divide a real step
    # count, otherwise a distributed eval-only run fails on `None // int`.
    if args.local_rank != -1 and t_total is not None:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    # Bound up front so the evaluation report below can be built even when no
    # training step ran (e.g. --do_eval without --do_train).
    tr_loss, nb_tr_steps = 0, 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Running totals are reset every epoch, so the values reported
            # after training describe the last epoch only.
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(),
                                                           test_nan=True)
                        if is_nan:
                            # Skip this update entirely and retry with a smaller loss scale.
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # is_training=True is passed for the validation file as well.
        # NOTE(review): presumably so that labels are read — confirm against read_swag_examples.
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training=True)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                # One call with labels for the loss, one without for the logits.
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  # No training step ran -> there is no training loss to report.
                  'loss': tr_loss / nb_tr_steps if nb_tr_steps else None}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
    # Data location, model name, sequence length and every other setting are
    # taken from the command line inside main(); nothing is read from a
    # hard-coded path before the arguments have been parsed.
    main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment