chenpangpang / transformers · Commits
"examples/vscode:/vscode.git/clone" did not exist on "562b6369c4a009c748ca86ce05d6ae5b7670e705"
Commit 50b7e52a, authored Jul 10, 2019 by thomwolf

WIP examples

Parent: ed6c8d37 · Changes: 5
Showing 5 changed files with 370 additions and 218 deletions (+370 −218).
Files changed:
  examples/run_glue.py                      +117  −117
  examples/run_squad.py                     +189  −100
  examples/utils.py (new file)               +61    −0
  pytorch_transformers/modeling_xlnet.py      +1    −1
  pytorch_transformers/optimization.py        +2    −0
examples/run_glue.py

@@ -37,7 +37,7 @@ from pytorch_transformers import (BertForSequenceClassification, XLNetForSequenc
                                   XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from pytorch_transformers import (BertTokenizer, XLNetTokenizer, XLMTokenizer)
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.optimization import BertAdam
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

@@ -60,12 +60,12 @@ TOKENIZER_CLASSES = {
     'xlm': XLMTokenizer,
 }

-def train(args, train_dataset, model):
+def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()

-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    args.train_batch_size = args.per_gpu_train_batch_size * args.n_gpu
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

@@ -76,42 +76,36 @@ def train(args, train_dataset, model):
     num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
+    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
     if args.fp16:
         try:
-            from apex.optimizers import FP16_Optimizer, FusedAdam
+            from apex import amp
         except ImportError:
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

     # Train!
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Batch size = %d", args.train_batch_size)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
     logger.info("  Total optimization steps = %d", num_train_optimization_steps)

     global_step = 0
-    tr_loss = 0
-    model.train()
+    tr_loss, logging_loss = 0.0, 0.0
     optimizer.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+            model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
                       'attention_mask': batch[1],

@@ -125,23 +119,25 @@ def train(args, train_dataset, model):
             if args.gradient_accumulation_steps > 1:
                 loss = loss / args.gradient_accumulation_steps

-            loss.backward() if not args.fp16 else optimizer.backward(loss)
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()

             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    # modify learning rate with special warm up BERT uses
-                    # if args.fp16 is False, BertAdam is used that handles this automatically
-                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                    for param_group in optimizer.param_groups:
-                        param_group['lr'] = lr_this_step
                 optimizer.step()
                 optimizer.zero_grad()
                 global_step += 1

-                if args.local_rank in [-1, 0]:
-                    if not args.fp16:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', loss.item(), global_step)
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    if args.local_rank == -1:  # Only evaluate on single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss

             if args.max_steps > 0 and global_step > args.max_steps:
                 break
         if args.max_steps > 0 and global_step > args.max_steps:

@@ -150,18 +146,26 @@ def train(args, train_dataset, model):
     return global_step, tr_loss / global_step

-def evalutate(args, eval_task, eval_output_dir, dataset, model):
+def evaluate(args, model, tokenizer):
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+
+    results = {}
+    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
     """ Evaluate the model """
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)

     # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
-    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Num examples = %d", len(eval_dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0

@@ -194,6 +198,7 @@ def evalutate(args, eval_task, eval_output_dir, dataset, model):
     elif args.output_mode == "regression":
         preds = np.squeeze(preds)
     result = compute_metrics(eval_task, preds, out_label_ids)
+    results.update(result)

     output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
     with open(output_eval_file, "w") as writer:

@@ -202,10 +207,10 @@ def evalutate(args, eval_task, eval_output_dir, dataset, model):
             logger.info("  %s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))

-    return result
+    return results

-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+def load_and_cache_examples(args, task, tokenizer, evaluate=False, overwrite_cache=False):
     processor = processors[task]()
     output_mode = output_modes[task]
     # Load data features from cache or dataset file

@@ -214,7 +219,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:

@@ -270,39 +275,44 @@ def main():
                         help="Whether to run eval on the dev set.")
     parser.add_argument("--do_lower_case", action='store_true',
                         help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU for training.")
     parser.add_argument("--eval_batch_size", default=8, type=int,
                         help="Total batch size for eval.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--max_steps", default=-1, type=int,
                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
+    parser.add_argument('--logging_steps', type=int, default=100, help="Log every X updates steps.")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
                         help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets")
     parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
     parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale', type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()

     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:

@@ -362,13 +372,10 @@ def main():
     if args.local_rank == 0:
         torch.distributed.barrier()

-    # Distributed, parrallel and fp16 model
-    if args.fp16:
-        model.half()
+    # Distributed and parrallel training
     model.to(args.device)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model,
                                                           device_ids=[args.local_rank],
                                                           output_device=args.local_rank,
                                                           find_unused_parameters=True)
     elif args.n_gpu > 1:

@@ -377,7 +384,7 @@ def main():
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

@@ -402,17 +409,10 @@ def main():
         model.to(args.device)

     # Evaluation
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Handle MNLI double evaluation
-        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
-        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
-            result = evalutate(args, eval_task, eval_output_dir, eval_dataset, model)
-    return result
+    if args.do_eval and args.local_rank in [-1, 0]:
+        results = evaluate(args, model, tokenizer)
+    return results

 if __name__ == "__main__":
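The main functional change to the run_glue.py training loop above is the switch from apex's FP16_Optimizer/FusedAdam to the AMP API: amp.initialize once before training, amp.scale_loss around the backward pass. The following is a minimal, self-contained sketch of that pattern, not code from the commit; it assumes NVIDIA apex and a CUDA device are available, and the toy linear model and SGD optimizer are stand-ins for the real model and BertAdam.

# Minimal sketch of the apex AMP pattern used by the new training loop.
# Assumes NVIDIA apex is installed and a CUDA device is available.
import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()           # stand-in for the real model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Wrap model and optimizer once, before the training loop.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for _ in range(3):                               # stand-in for the real dataloader loop
    inputs = torch.randn(8, 10).cuda()
    targets = torch.randint(0, 2, (8,)).cuda()
    loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    # Scale the loss so fp16 gradients do not underflow, then backprop.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

The design point is that AMP wraps an ordinary optimizer, so loss scaling no longer needs the dedicated FP16_Optimizer wrapper and the manual warmup learning-rate adjustment that the old fp16 branch carried.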
examples/run_squad.py

@@ -33,36 +33,156 @@ from tqdm import tqdm, trange
 from tensorboardX import SummaryWriter

-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForQuestionAnswering
-from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers import (BertForQuestionAnswering, XLNetForQuestionAnswering,
+                                  XLMForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from pytorch_transformers import (BertTokenizer, XLNetTokenizer, XLMTokenizer)

 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions

-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle

 logger = logging.getLogger(__name__)

+ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+
+MODEL_CLASSES = {
+    'bert': BertForQuestionAnswering,
+    'xlnet': XLNetForQuestionAnswering,
+    'xlm': XLMForQuestionAnswering,
+}
+
+TOKENIZER_CLASSES = {
+    'bert': BertTokenizer,
+    'xlnet': XLNetTokenizer,
+    'xlm': XLMTokenizer,
+}
+
+def train(args, train_dataset, model):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        num_train_optimization_steps = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Batch size = %d", args.train_batch_size)
+    logger.info("  Total batch size (distributed) = %d", args.train_batch_size * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.train()
+    optimizer.zero_grad()
+    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            ouputs = model(**inputs)
+            loss = ouputs[0]
+
+def evalutate(args, dataset, model):
+    """ Evaluate the model """
+
+def load_and_cache_examples(args, tokenizer, training=True):
+    """ Load data features from cache or dataset file. """
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format('dev' if evaluate else 'train',
+        list(filter(None, args.model_name.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = read_squad_examples(input_file=args.train_file if training else args.predict_file,
+                                       is_training=training,
+                                       version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=training)
+        if args.local_rank in [-1, 0]:
+            logger.info("Num orig examples = %d", len(examples))
+            logger.info("Num split examples = %d", len(features))
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+    if training:
+        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
+    else:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+    return dataset

 def main():
     parser = argparse.ArgumentParser()

     ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                             "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model checkpoints and predictions will be written.")

     ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
     parser.add_argument("--max_seq_length", default=384, type=int,
                         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                              "longer than this will be truncated, and sequences shorter than this will be padded.")

@@ -71,65 +191,53 @@ def main():
     parser.add_argument("--max_query_length", default=64, type=int,
                         help="The maximum number of tokens for the question. Questions longer than this will "
                              "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
+    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
     parser.add_argument("--do_predict", action='store_true',
                         help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.")
     parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
     parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
+                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
     parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
     parser.add_argument("--max_answer_length", default=30, type=int,
                         help="The maximum length of an answer that can be generated. This is needed because the start "
                              "and end predictions are not conditioned on one another.")
     parser.add_argument("--verbose_logging", action='store_true',
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Whether not to use CUDA when available")
     parser.add_argument('--seed',
                         type=int, default=42,
                         help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.")
     parser.add_argument("--local_rank", type=int, default=-1,
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--fp16',
                         action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--loss_scale', type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative', action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()

     print(args)

+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd

@@ -137,71 +245,52 @@ def main():
         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
         ptvsd.wait_for_attach()

+    # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+        args.n_gpu = torch.cuda.device_count()
     else:
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device

-    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
-                        datefmt='%m/%d/%Y %H:%M:%S',
-                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    # Setup logging
+    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

+    # Setup seeds
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if n_gpu > 0:
+    if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)

-    if not args.do_train and not args.do_predict:
-        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
-    if args.do_train:
-        if not args.train_file:
-            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
-    if args.do_predict:
-        if not args.predict_file:
-            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
+    # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only 1st process in distributed training download model & vocab
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+    args.model_type = args.model_name.lower().split('-')[0]
+    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)

     if args.local_rank == 0:
         torch.distributed.barrier()

-    if args.fp16:
-        model.half()
-    model.to(device)
+    # Distributed and parrallel training
+    model.to(args.device)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model,
                                                           device_ids=[args.local_rank],
                                                           output_device=args.local_rank,
                                                           find_unused_parameters=True)
-    elif n_gpu > 1:
+    elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)

+    # Training
     if args.do_train:
         if args.local_rank in [-1, 0]:
             tb_writer = SummaryWriter()
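The new load_and_cache_examples in run_squad.py builds a TensorDataset whose columns differ between training (answer start/end positions) and prediction (an example index used to map results back to examples). The following is a rough, self-contained sketch of that construction, not code from the commit; the namedtuple below is a dummy stand-in for the real SQuAD feature objects produced by convert_examples_to_features.

# Sketch of the train-vs-eval TensorDataset construction, with dummy features.
import torch
from torch.utils.data import TensorDataset
from collections import namedtuple

Feature = namedtuple('Feature', ['input_ids', 'input_mask', 'segment_ids',
                                 'start_position', 'end_position'])
features = [Feature([101, 2054, 102, 0], [1, 1, 1, 0], [0, 0, 1, 1], 2, 2),
            Feature([101, 2073, 102, 0], [1, 1, 1, 0], [0, 0, 1, 1], 1, 2)]

all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)

training = True
if training:
    # Training needs the answer-span labels.
    all_start = torch.tensor([f.start_position for f in features], dtype=torch.long)
    all_end = torch.tensor([f.end_position for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start, all_end)
else:
    # Evaluation keeps an example index instead, to map predictions back to examples.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)

print(len(dataset), [t.shape for t in dataset[0]])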
examples/utils.py (new file, 0 → 100644)

# Copyright (c) 2019-present, the HuggingFace Inc. authors.
# All rights reserved. This source code is licensed under the BSD-style
# license found in the LICENSE file in the root directory of this source tree.
import logging
import os

from tqdm import tqdm
from pprint import pformat

import torch

from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import RunningAverage
from ignite.contrib.handlers import ProgressBar
from ignite.contrib.handlers.tensorboard_logger import OptimizerParamsHandler, OutputHandler, TensorboardLogger


def average_distributed_scalar(scalar, args):
    """ Average a scalar over nodes if we are in distributed training.
        We use this for distributed evaluation.
        Beware, such averages only works for metrics which are additive with regard
        to the evaluation dataset, e.g. accuracy, log probabilities.
        Doesn't work for ratio metrics like F1.
    """
    if args.local_rank == -1:
        return scalar
    scalar_t = torch.tensor(scalar, dtype=torch.float, device=args.device) / torch.distributed.get_world_size()
    torch.distributed.all_reduce(scalar_t, op=torch.distributed.ReduceOp.SUM)
    return scalar_t.item()


def add_logging_and_checkpoint_saving(trainer, evaluator, metrics, model, optimizer, args, prefix=""):
    """ Add to a PyTorch ignite training engine tensorboard logging,
        progress bar with average loss, checkpoint saving and save training config.
    """
    # Add progress bar with average loss
    RunningAverage(output_transform=lambda x: x).attach(trainer, prefix + "loss")
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=[prefix + "loss"])
    evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

    # Add tensorboard logging with training and evaluation metrics
    tb_logger = TensorboardLogger(log_dir=None)
    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=[prefix + "loss"]), event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)

    @evaluator.on(Events.COMPLETED)
    def tb_log_metrics(engine):
        for name in metrics.keys():
            tb_logger.writer.add_scalar(name, engine.state.metrics[name], trainer.state.iteration)

    # Add checkpoint saving after each epoch - take care of distributed encapsulation ('getattr()')
    checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})

    # Save training configuration
    torch.save(args, os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))

    return checkpoint_handler, tb_logger
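average_distributed_scalar divides each rank's value by the world size and then all_reduces with SUM, which is exactly the mean across ranks; hence the docstring's warning that this only makes sense for metrics that are additive over the evaluation set. A plain-Python sanity check of that arithmetic (no process group needed; the per-rank values below are made up for illustration):

# Why dividing by world_size before all_reduce(SUM) gives the mean:
# with N ranks each holding scalar_i, sum_i(scalar_i / N) == mean(scalar_i).
scalars = [0.80, 0.90, 0.70, 0.84]               # hypothetical per-rank values
world_size = len(scalars)
reduced = sum(s / world_size for s in scalars)   # what all_reduce(SUM) would produce
assert abs(reduced - sum(scalars) / world_size) < 1e-12
print(reduced)                                   # 0.81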
pytorch_transformers/modeling_xlnet.py

@@ -393,7 +393,7 @@ class XLNetRelativeAttention(nn.Module):
         x = x[1:, ...]
         x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3])
         # x = x[:, 0:klen, :, :]
-        x = torch.index_select(x, 1, torch.arange(klen))
+        x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))

         return x
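The one-line fix above gives the index tensor an explicit device and dtype: torch.arange defaults to CPU, and torch.index_select requires its index to live on the same device as the input, so the old form breaks once x sits on a GPU. A small sketch of the failure mode and the fix, assuming a CUDA device is available (the tensor shapes are arbitrary):

# torch.index_select needs the index on the same device as the input tensor.
import torch

if torch.cuda.is_available():
    x = torch.randn(4, 6, device="cuda")
    klen = 5
    # torch.index_select(x, 1, torch.arange(klen))               # would raise: index is on CPU
    idx = torch.arange(klen, device=x.device, dtype=torch.long)   # the fix in this commit
    print(torch.index_select(x, 1, idx).shape)                    # torch.Size([4, 5])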
pytorch_transformers/optimization.py

@@ -227,6 +227,8 @@ class BertAdam(Optimizer):
         lr = []
         for group in self.param_groups:
             for p in group['params']:
+                if p.grad is None:
+                    continue
                 state = self.state[p]
                 if len(state) == 0:
                     return [0]
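The added guard skips parameters whose .grad is still None, i.e. parameters that never took part in a backward pass and therefore have no optimizer state for get_lr to read. The sketch below shows the same guard pattern with a stock PyTorch optimizer, not BertAdam itself; the unused layer is a contrived stand-in for a parameter group that receives no gradients.

# Generic sketch of the "skip params with no gradient" guard.
import torch

model = torch.nn.Linear(4, 2)
unused = torch.nn.Linear(4, 2)                       # never receives gradients
optimizer = torch.optim.SGD(list(model.parameters()) + list(unused.parameters()),
                            lr=0.1, momentum=0.9)

loss = model(torch.randn(3, 4)).sum()
loss.backward()
optimizer.step()

for group in optimizer.param_groups:
    for p in group['params']:
        if p.grad is None:                           # the guard: skip untouched params
            continue
        state = optimizer.state[p]                   # only inspect state that exists
        print(p.shape, list(state.keys()))           # e.g. ['momentum_buffer']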