chenpangpang / transformers · Commits · b21d84b0
"...runtime/git@developer.sourcefind.cn:change/sglang.git" did not exist on "66e7dcaf7008d2ffe892044a21513a6e06424d1a"
Commit b21d84b0, authored Jul 11, 2019 by thomwolf

    update examples

Parent: ec07cf5a
Showing 5 changed files with 9 additions and 1124 deletions (+9 −1124)
examples/run_bert_classifier.py           +0   −528
examples/run_glue.py                      +7   −3
examples/run_xlnet_classifier.py          +0   −530
examples/utils.py                         +0   −61
pytorch_transformers/optimization.py      +2   −2
examples/run_bert_classifier.py (deleted, 100644 → 0) · view file @ ec07cf5a
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function

import argparse
import logging
import os
import sys
import random

from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from tensorboardX import SummaryWriter

from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
from pytorch_transformers.modeling_bert import BertForSequenceClassification
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule

from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.\n"
                             "Sequences longer than this will be truncated, and sequences shorter\n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier()

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                loss = outputs[0]

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

    ### Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    ### Example:
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()
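A minimal reload sketch (not part of the commit) for the saving convention the deleted script follows: after a --do_train run, the output directory holds WEIGHTS_NAME, CONFIG_NAME and the tokenizer vocabulary, so it can be passed straight back to from_pretrained. The path and num_labels below are assumed values for illustration.

# Sketch only: reload a checkpoint written by the script above.
from pytorch_transformers.modeling_bert import BertForSequenceClassification
from pytorch_transformers.tokenization_bert import BertTokenizer

output_dir = "/tmp/mrpc_output"   # hypothetical, the same --output_dir used for training
model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=2)  # num_labels must match training
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
model.eval()   # ready for inference / evaluation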
examples/run_glue.py · view file @ b21d84b0
@@ -74,8 +74,8 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-    schedule = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -300,6 +300,8 @@ def main():
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
                         help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,
                         help="Max gradient norm.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
@@ -358,7 +360,9 @@ def main():
     args.device = device
 
     # Setup logging
-    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                        datefmt='%m/%d/%Y %H:%M:%S',
+                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
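An illustrative sketch, not taken from the diff, of the optimizer/scheduler pairing this hunk moves run_glue.py to: AdamW now receives a configurable epsilon (--adam_epsilon) and the schedule object is consistently named scheduler. A toy linear model and made-up hyperparameter values stand in for the GLUE model; the scheduler is a LambdaLR subclass (see pytorch_transformers/optimization.py below), so it is stepped once per optimizer update.

# Sketch under the assumptions above.
import torch
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(8, 2)                      # stand-in for the fine-tuned model
no_decay = ['bias']
grouped = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
t_total = 10                                       # total number of optimization steps (assumed)
optimizer = AdamW(grouped, lr=5e-5, eps=1e-8)      # eps is what --adam_epsilon now controls
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=2, t_total=t_total)

for step in range(t_total):
    loss = model(torch.randn(4, 8)).pow(2).mean()  # dummy loss in place of the GLUE forward pass
    loss.backward()
    optimizer.step()
    scheduler.step()                               # advance the warmup/decay schedule once per update
    optimizer.zero_grad()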
examples/run_xlnet_classifier.py (deleted, 100644 → 0) · view file @ ec07cf5a
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function

import argparse
import logging
import os
import sys
import random

from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from tensorboardX import SummaryWriter

from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
from pytorch_transformers.modeling_xlnet import XLNetForSequenceClassification
from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule

from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # training
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0 limit the number of training steps to perform, you should choose only one of num_train_epochs and max_steps.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--clip_gradients", default=1.0, type=float, help="Clip gradient norms.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--log_every", default=10, type=int, help="Log metrics every X training steps.")

    # evaluation
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")

    # Model
    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")

    # task specific
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.\n"
                             "Sequences longer than this will be truncated, and sequences shorter\n"
                             "than this will be padded.")
    parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory")

    # Misc
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case)
    model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier()

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    curr_tr_loss, curr_steps = 0., 1

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        if os.path.exists(cached_train_features_file):
            logger.info("Loading train features for cache file %s", cached_train_features_file)
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        else:
            logger.info("No cache file at %s, preparing train features", cached_train_features_file)
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                cls_token_at_end=True, cls_token=tokenizer.cls_token, sep_token=tokenizer.sep_token,
                cls_token_segment_id=2, pad_on_left=True, pad_token_segment_id=4)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = SequentialSampler(train_data)  # RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        if args.max_steps > 0:
            num_train_optimization_steps = args.max_steps
        else:
            num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer
        optimizer_grouped_parameters = model.parameters()
        # param_optimizer = list(model.named_parameters())
        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs) if args.max_steps <= 0 else int('Inf'),
                        desc="Epoch", disable=args.local_rank not in [-1, 0]):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                gnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)

                curr_tr_loss += loss.item()
                curr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    if args.local_rank in [-1, 0] and (args.log_every <= 0 or (global_step + 1) % args.log_every == 0):
                        learning_rate = optimizer.get_lr()[0] if not args.fp16 else lr_this_step
                        logger.info("[{}] | gnorm {:.2f} lr {:8.6f} | loss {:.2f}".format(
                            global_step, gnorm, learning_rate, curr_tr_loss / curr_steps))
                        tb_writer.add_scalar('lr', learning_rate, global_step)
                        tb_writer.add_scalar('loss', curr_tr_loss / curr_steps, global_step)
                        curr_tr_loss, curr_steps = 0., 1

                if args.max_steps > 0 and global_step > args.max_steps:
                    break
            if args.max_steps > 0 and global_step > args.max_steps:
                break

    ### Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    ### Example:
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        if os.path.exists(cached_eval_features_file):
            logger.info("Loading eval features for cache file %s", cached_eval_features_file)
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        else:
            logger.info("No cache file at %s, preparing eval features", cached_eval_features_file)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                cls_token_at_end=True, cls_token=tokenizer.cls_token, sep_token=tokenizer.sep_token,
                cls_token_segment_id=2, pad_on_left=True, pad_token_segment_id=4)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)

        loss = curr_tr_loss / curr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = curr_tr_loss / curr_steps if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()
examples/utils.py (deleted, 100644 → 0) · view file @ ec07cf5a
# Copyright (c) 2019-present, the HuggingFace Inc. authors.
# All rights reserved. This source code is licensed under the BSD-style
# license found in the LICENSE file in the root directory of this source tree.
import logging
import os

from tqdm import tqdm
from pprint import pformat

import torch

from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import RunningAverage
from ignite.contrib.handlers import ProgressBar
from ignite.contrib.handlers.tensorboard_logger import OptimizerParamsHandler, OutputHandler, TensorboardLogger


def average_distributed_scalar(scalar, args):
    """ Average a scalar over nodes if we are in distributed training.
        We use this for distributed evaluation.
        Beware, such averages only work for metrics which are additive with regard
        to the evaluation dataset, e.g. accuracy, log probabilities.
        Doesn't work for ratio metrics like F1.
    """
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()


def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""):
    """ Add to a PyTorch ignite training engine tensorboard logging,
        progress bar with average loss, checkpoint saving and save training config.
    """
    # Add progress bar with average loss
    RunningAverage(output_transform=lambda x: x).attach(trainer, prefix + "loss")
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=[prefix + "loss"])
    evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    # Add tensorboard logging with training and evaluation metrics
    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=[prefix + "loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)

    @evaluator.on(Events.COMPLETED)
    def tb_log_metrics(engine):
        for name in metrics.keys():
            tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration)

    # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()')
    checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})

    # Save training configuration
    torch.save(args, os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))

    return checkpoint_handler, tb_logger
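A hedged usage sketch (not part of the commit) for average_distributed_scalar above: in a single-process run (local_rank == -1) the scalar passes through unchanged, while in distributed runs it is all-reduced and averaged across workers. SimpleNamespace stands in for the parsed argparse namespace the helper expects.

# Sketch under the assumptions above.
from types import SimpleNamespace

args = SimpleNamespace(local_rank=-1, device="cpu")    # assumed single-process setup
val_accuracy = 0.87                                    # e.g. a metric computed on this worker's shard
print(average_distributed_scalar(val_accuracy, args))  # -> 0.87, no reduction without a distributed backend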
pytorch_transformers/optimization.py · view file @ b21d84b0
@@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
 class ConstantLRSchedule(LambdaLR):
     def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLR, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
+        super(ConstantLRSchedule, self).__init__(optimizer, lambda x: x, last_epoch=last_epoch)
 
 
 class WarmupCosineSchedule(LambdaLR):
     """
@@ -128,7 +128,7 @@ class AdamW(Optimizer):
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                         correct_bias=correct_bias)
-        super(BertAdam, self).__init__(params, defaults)
+        super(AdamW, self).__init__(params, defaults)
 
     def step(self, closure=None):
         """Performs a single optimization step.
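Both hunks fix the same class of bug: the classes were renamed (ConstantLR became ConstantLRSchedule, BertAdam became AdamW) but their super() calls still referenced the old names, which no longer exist in the module, so instantiating either class raised NameError. The toy snippet below is illustrative only and uses made-up class names, not repository code, to show that failure mode and the corrected pattern.

# Illustrative repro under the assumptions above.
class Base:
    def __init__(self, tag):
        self.tag = tag

class RenamedSchedule(Base):                      # imagine this class used to be called OldSchedule
    def __init__(self, tag):
        super(OldSchedule, self).__init__(tag)    # NameError: 'OldSchedule' is not defined

class FixedSchedule(Base):
    def __init__(self, tag):
        super(FixedSchedule, self).__init__(tag)  # references the class's current name, as in the patch

FixedSchedule("ok")                               # works
try:
    RenamedSchedule("boom")
except NameError as err:
    print(err)                                    # the failure the patch removes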