chenpangpang / transformers / Commits

Commit f6081f22, authored Jun 24, 2019 by thomwolf (parent c946bb51)

    add xlnetforsequence classif and run_classifier example for xlnet

Showing 9 changed files with 625 additions and 46 deletions (+625, -46).
Changed files:

    examples/bertology.py                        +1    -1
    examples/run_bert_classifier.py              +1    -1
    examples/run_bert_extract_features.py        +0    -0
    examples/run_bert_squad.py                   +1    -1
    examples/run_xlnet_classifier.py             +539  -0
    examples/utils_glue.py                       +0    -0
    examples/utils_squad.py                      +1    -0
    hubconfs/xlnet_hubconf.py                    +33   -33
    pytorch_pretrained_bert/modeling_xlnet.py    +49   -10
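The common thread in the renames below is that every example script now imports its GLUE helpers from the shared examples/utils_glue.py (previously run_classifier_dataset_utils.py). A minimal sketch of the entry points those diffs converge on; the "mnli" task name is taken from the MNLI handling in run_xlnet_classifier.py and is only an illustration:

    # Sketch of the shared utils_glue entry points used by the example scripts
    # after this commit ("mnli" is an illustrative task name).
    from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

    processor = processors["mnli"]()      # task-specific data processor
    output_mode = output_modes["mnli"]    # "classification" or "regression"
    label_list = processor.get_labels()   # label set used to build features
    num_labels = len(label_list)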
examples/bertology.py

@@ -14,7 +14,7 @@ from torch.nn import CrossEntropyLoss, MSELoss
 from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
 logger = logging.getLogger(__name__)
examples/run_classifier.py → examples/run_bert_classifier.py

@@ -39,7 +39,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
examples/extract_features.py → examples/run_bert_extract_features.py

File moved.
examples/run_squad.py → examples/run_bert_squad.py

@@ -38,7 +38,7 @@ from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 from pytorch_pretrained_bert.tokenization import BertTokenizer
-from run_squad_dataset_utils import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
examples/run_xlnet_classifier.py (new file, 0 → 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function

import argparse
import logging
import os
import sys
import random

from tqdm import tqdm, trange

import numpy as np

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from tensorboardX import SummaryWriter

from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case)
    model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier()

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    ### Example:
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()
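Once training finishes, the script writes WEIGHTS_NAME, CONFIG_NAME and the vocabulary to --output_dir and immediately reloads them with from_pretrained. A minimal inference sketch assembled from the same calls the script and the hubconf docstring use; the output directory, sentence and label count are illustrative placeholders, and the return-value handling simply mirrors the evaluation loop of this work-in-progress code:

    import torch
    import numpy as np
    from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
    from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer

    output_dir = "./xlnet_output"  # placeholder: the --output_dir used for fine-tuning

    # Reload the fine-tuned model and vocabulary, as the training branch does.
    tokenizer = XLNetTokenizer.from_pretrained(output_dir, do_lower_case=False)
    model = XLNetForSequenceClassification.from_pretrained(output_dir, num_labels=2)
    model.eval()

    # Tokenize a single example (illustrative sentence, taken from the hubconf docstring).
    tokens = tokenizer.tokenize("Who was Jim Henson ? Jim Henson was a puppeteer")
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], dtype=torch.long)
    segment_ids = torch.zeros_like(input_ids)
    input_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        # Same call pattern as the evaluation loop above.
        logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
    pred = np.argmax(logits.detach().cpu().numpy(), axis=1)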
examples/run_classifier_dataset_utils.py → examples/utils_glue.py

File moved.
examples/run_squad_dataset_utils.py → examples/utils_squad.py

 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 ...
hubconfs/xlnet_hubconf.py

@@ -3,7 +3,7 @@ from pytorch_pretrained_bert.modeling_xlnet import (
     XLNetConfig,
     XLNetModel,
     XLNetLMHeadModel,
-    XLNetForSequenceClassification
+    # XLNetForSequenceClassification
 )
 
 # A lot of models share the same param doc. Use a decorator
@@ -135,35 +135,35 @@ def xlnetLMHeadModel(*args, **kwargs):
     return model
 
 
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetForSequenceClassification(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-    "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-    by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-    Example:
-        # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        # Prepare tokenized input
-        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        >>> tokenized_text1 = tokenizer.tokenize(text1)
-        >>> tokenized_text2 = tokenizer.tokenize(text2)
-        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-        # Load xlnetForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-        >>> model.eval()
-
-        # Predict sequence classes logits
-        >>> with torch.no_grad():
-                lm_logits, mems = model(tokens_tensor)
-    """
-    model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-    return model
+# @_append_from_pretrained_docstring(xlnet_docstring)
+# def xlnetForSequenceClassification(*args, **kwargs):
+#     """
+#     xlnetModel is the basic XLNet Transformer model from
+#     "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+#     by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+#
+#     Example:
+#         # Load the tokenizer
+#         >>> import torch
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+#
+#         # Prepare tokenized input
+#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         >>> tokenized_text1 = tokenizer.tokenize(text1)
+#         >>> tokenized_text2 = tokenizer.tokenize(text2)
+#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+#
+#         # Load xlnetForSequenceClassification
+#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         >>> model.eval()
+#
+#         # Predict sequence classes logits
+#         >>> with torch.no_grad():
+#                 lm_logits, mems = model(tokens_tensor)
+#     """
+#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+#     return model
pytorch_pretrained_bert/modeling_xlnet.py

@@ -1194,6 +1194,38 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         return logits, new_mems
 
         # return all_attentions, encoded_layers, pooled_output
 
+
+class XLNetSequenceSummary(nn.Module):
+    def __init__(self, config, summary_type="last", use_proj=True,
+                 output_attentions=False, keep_multihead_output=False):
+        super(XLNetSequenceSummary, self).__init__()
+        self.summary_type = summary_type
+        if use_proj:
+            self.summary = nn.Linear(config.hidden_size, num_labels)
+        else:
+            self.summary = None
+        if summary_type == 'attn':
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+        self.dropout = nn.Dropout(config.dropout)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states, input_mask=None):
+        if self.summary_type == 'last':
+            output = hidden_states[-1]
+        elif self.summary_type == 'first':
+            output = hidden_states[0]
+        elif self.summary_type == 'mean':
+            output = hidden_states.mean(dim=0)
+        elif summary_type == 'attn':
+            raise NotImplementedError
+        output = self.summary(output)
+        output = self.dropout(output)
+        output = self.activation(output)
+        return output
+
+
 class XLNetForSequenceClassification(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").

@@ -1255,19 +1287,23 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, summary_type="last", output_attentions=False, keep_multihead_output=False):
+    def __init__(self, config, summary_type="last", use_proj=True, num_labels=2, is_regression=False,
+                 output_attentions=False, keep_multihead_output=False):
         super(XLNetForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
         self.attn_type = config.attn_type
         self.same_length = config.same_length
         self.summary_type = summary_type
+        self.is_regression = is_regression
 
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)
 
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
-        self.apply(self.init_xlnet_weights)
-        self.tie_weights()
+        self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
+                                                     use_proj=use_proj, output_attentions=output_attentions,
+                                                     keep_multihead_output=keep_multihead_output)
+        self.loss_proj = nn.Linear(config.d_model, num_classes if not is_regression else 1)
+        self.apply(self.init_bert_weights)
 
     def forward(self, inp_k, seg_id=None, input_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,

@@ -1295,17 +1331,20 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
         """
-        output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
-                                                           mems, perm_mask, target_mapping, inp_q,
-                                                           output_all_encoded_layers, head_mask)
+        output, _, new_mems = self.transformer(inp_k, seg_id, input_mask,
+                                               mems, perm_mask, target_mapping, inp_q,
+                                               output_all_encoded_layers, head_mask)
 
-        logits = self.lm_loss(output)
+        output = self.sequence_summary(output)
+        logits = self.loss_proj(output)
 
         if target is not None:
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(logits.view(-1, logits.size(-1)),
-                            target.view(-1))
+            if self.is_regression:
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), target.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss(ignore_index=-1)
+                loss = loss_fct(logits.view(-1, logits.size(-1)),
+                                target.view(-1))
             return loss, new_mems
 
         # if self.output_attentions:
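The new XLNetSequenceSummary module reduces the transformer's sequence of hidden states to a single vector before the classification projection, selecting the last position, the first position, or the mean over positions according to summary_type ('attn' is left unimplemented). A small self-contained sketch of just that pooling step, assuming hidden states laid out as (seq_len, batch, d_model) as the indexing in the diff implies:

    import torch

    def summarize(hidden_states, summary_type="last"):
        # hidden_states: (seq_len, batch, d_model); mirrors the branches in XLNetSequenceSummary.
        if summary_type == "last":
            return hidden_states[-1]           # hidden state at the last position
        elif summary_type == "first":
            return hidden_states[0]            # hidden state at the first position
        elif summary_type == "mean":
            return hidden_states.mean(dim=0)   # average over all positions
        raise NotImplementedError(summary_type)

    pooled = summarize(torch.randn(16, 4, 1024), summary_type="last")  # shape (4, 1024)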