chenpangpang / transformers · Commits

Commit a75c64d8: Black 20 release
Authored Aug 26, 2020 by Lysandre · parent e78c1103 · 191 changes

Showing 20 changed files with 302 additions and 101 deletions (+302 / -101)
examples/movement-pruning/masked_run_glue.py                              +49  -16
examples/movement-pruning/masked_run_squad.py                             +24  -12
examples/multiple-choice/utils_multiple_choice.py                         +22   -4
examples/question-answering/run_squad_trainer.py                           +6   -1
examples/question-answering/run_tf_squad.py                                +6   -1
examples/seq2seq/bertabs/configuration_bertabs.py                          +1   -1
examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py   +13   -4
examples/seq2seq/bertabs/modeling_bertabs.py                              +52  -16
examples/seq2seq/bertabs/run_summarization.py                             +35  -13
examples/seq2seq/bertabs/test_utils_summarization.py                       +2   -4
examples/seq2seq/bertabs/utils_summarization.py                            +8   -8
examples/seq2seq/callbacks.py                                              +6   -1
examples/seq2seq/distillation.py                                           +4   -1
examples/seq2seq/test_seq2seq_examples.py                                 +17   -5
examples/seq2seq/utils.py                                                  +3   -1
examples/text-classification/run_pl_glue.py                                +9   -2
examples/text-classification/run_xnli.py                                   +5   -1
examples/text-generation/pplm/run_pplm.py                                 +27   -7
examples/text-generation/pplm/run_pplm_discrim_train.py                    +6   -1
examples/text-generation/run_generation.py                                 +7   -2
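These diffs come from rerunning the examples through the 2020 Black release. Most hunks changed only line wrapping, so the inline view below shows them once in their post-commit form; hunks whose two sides differ token-for-token (joined docstrings, expressions rewrapped in parentheses, calls with a trailing comma exploded one argument per line) are marked with - and + lines. The behavior driving most of these hunks is Black's "magic trailing comma". A minimal sketch, assuming `pip install black` (`format_str` and `FileMode` are real Black APIs; the snippet itself is illustrative, not part of the commit):

    import black

    # A call that would fit on one line, but with a trailing comma before ")".
    src = "f(a, b, c,)\n"

    # FileMode() uses Black's defaults. With Black >= 20.8b0, the trailing
    # comma forces the call onto one line per argument.
    print(black.format_str(src, mode=black.FileMode()))
    # f(
    #     a,
    #     b,
    #     c,
    # )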
examples/movement-pruning/masked_run_glue.py

@@ -173,7 +173,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
             model,
             device_ids=[args.local_rank],
             output_device=args.local_rank,
             find_unused_parameters=True,
         )

     # Train!

@@ -217,7 +220,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(
         epochs_trained,
         int(args.num_train_epochs),
         desc="Epoch",
         disable=args.local_rank not in [-1, 0],
     )
     set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:

@@ -280,11 +286,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     attention_mask=inputs["attention_mask"],
                 )
-                loss_logits = F.kl_div(
-                    input=F.log_softmax(logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
+                loss_logits = (
+                    F.kl_div(
+                        input=F.log_softmax(logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss

@@ -529,7 +538,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
             examples,
             tokenizer,
             max_length=args.max_seq_length,
             label_list=label_list,
             output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)

@@ -592,7 +605,10 @@ def main():
     )
     # Other parameters
     parser.add_argument(
         "--config_name",
         default="",
         type=str,
         help="Pretrained config name or path if not the same as model_name",
     )
     parser.add_argument(
         "--tokenizer_name",

@@ -616,17 +632,27 @@ def main():
     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
     parser.add_argument(
         "--evaluate_during_training",
         action="store_true",
         help="Run evaluation during training at each logging step.",
     )
     parser.add_argument(
         "--do_lower_case",
         action="store_true",
         help="Set this flag if you are using an uncased model.",
     )
     parser.add_argument(
         "--per_gpu_train_batch_size",
         default=8,
         type=int,
         help="Batch size per GPU/CPU for training.",
     )
     parser.add_argument(
         "--per_gpu_eval_batch_size",
         default=8,
         type=int,
         help="Batch size per GPU/CPU for evaluation.",
     )
     parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")

@@ -723,7 +749,10 @@ def main():
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
         "--num_train_epochs",
         default=3.0,
         type=float,
         help="Total number of training epochs to perform.",
     )
     parser.add_argument(
         "--max_steps",

@@ -742,10 +771,14 @@ def main():
     )
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
     parser.add_argument(
         "--overwrite_output_dir",
         action="store_true",
         help="Overwrite the content of the output directory",
     )
     parser.add_argument(
         "--overwrite_cache",
         action="store_true",
         help="Overwrite the cached training and evaluation sets",
     )
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
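The kl_div hunk above is the distillation term of the movement-pruning objective: a KL divergence between temperature-softened student and teacher distributions, rescaled by temperature squared so gradient magnitudes stay comparable across temperatures. A self-contained sketch of that computation on dummy logits (the tensor shapes and temperature value here are illustrative, not the script's defaults):

    import torch
    import torch.nn.functional as F

    torch.manual_seed(0)
    logits_stu = torch.randn(4, 3)  # student logits: batch of 4, 3 classes
    logits_tea = torch.randn(4, 3)  # teacher logits
    temperature = 2.0

    # Same expression the hunk wraps: KL(student || teacher) at temperature T,
    # scaled by T**2, averaged over the batch.
    loss_logits = (
        F.kl_div(
            input=F.log_softmax(logits_stu / temperature, dim=-1),
            target=F.softmax(logits_tea / temperature, dim=-1),
            reduction="batchmean",
        )
        * (temperature ** 2)
    )
    print(loss_logits)  # scalar tensor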
examples/movement-pruning/masked_run_squad.py

@@ -181,7 +181,10 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
             model,
             device_ids=[args.local_rank],
             output_device=args.local_rank,
             find_unused_parameters=True,
         )

     # Train!

@@ -304,16 +307,22 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     attention_mask=inputs["attention_mask"],
                 )
-                loss_start = F.kl_div(
-                    input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(start_logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
-                loss_end = F.kl_div(
-                    input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    target=F.softmax(end_logits_tea / args.temperature, dim=-1),
-                    reduction="batchmean",
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    F.kl_div(
+                        input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    F.kl_div(
+                        input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        target=F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        reduction="batchmean",
+                    )
+                    * (args.temperature ** 2)
+                )
                 loss_logits = (loss_start + loss_end) / 2.0
                 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss

@@ -859,7 +868,10 @@ def main():
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
         "--num_train_epochs",
         default=3.0,
         type=float,
         help="Total number of training epochs to perform.",
     )
     parser.add_argument(
         "--max_steps",
examples/multiple-choice/utils_multiple_choice.py

@@ -100,7 +100,12 @@ if is_torch_available():
             cached_features_file = os.path.join(
                 data_dir,
-                "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
+                "cached_{}_{}_{}_{}".format(
+                    mode.value,
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
+                ),
             )

             # Make sure only the first process in distributed training processes the dataset,

@@ -121,7 +126,12 @@ if is_torch_available():
                 else:
                     examples = processor.get_train_examples(data_dir)
                 logger.info("Training examples: %s", len(examples))
-                self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
+                self.features = convert_examples_to_features(
+                    examples,
+                    label_list,
+                    max_seq_length,
+                    tokenizer,
+                )
                 logger.info("Saving features into cached file %s", cached_features_file)
                 torch.save(self.features, cached_features_file)

@@ -164,7 +174,12 @@ if is_tf_available():
             examples = processor.get_train_examples(data_dir)
             logger.info("Training examples: %s", len(examples))
-            self.features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,)
+            self.features = convert_examples_to_features(
+                examples,
+                label_list,
+                max_seq_length,
+                tokenizer,
+            )

             def gen():
                 for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):

@@ -491,7 +506,10 @@ class ArcProcessor(DataProcessor):
 def convert_examples_to_features(
     examples: List[InputExample],
     label_list: List[str],
     max_length: int,
     tokenizer: PreTrainedTokenizer,
 ) -> List[InputFeatures]:
     """
     Loads a data file into a list of `InputFeatures`
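All three reformatted call sites above follow the same convert-then-cache pattern: featurize once, `torch.save` the result under a name keyed on the tokenizer and sequence length, then reload on subsequent runs. A stripped-down sketch of that pattern (the helper name and cache path are hypothetical, not from the diff):

    import os
    import torch

    def load_or_build(cache_path, build_fn):
        # Reuse the cached features if present; otherwise build and save them.
        if os.path.exists(cache_path):
            return torch.load(cache_path)
        features = build_fn()
        torch.save(features, cache_path)
        return features

    feats = load_or_build("/tmp/cached_demo.pt", lambda: [torch.arange(4)])
    print(feats)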
examples/question-answering/run_squad_trainer.py

@@ -137,7 +137,12 @@ def main():
     )
     # Initialize our Trainer
-    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )

     # Training
     if training_args.do_train:
examples/question-answering/run_tf_squad.py

@@ -231,7 +231,12 @@ def main():
         eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))

     # Initialize our Trainer
-    trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+    trainer = TFTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )

     # Training
     if training_args.do_train:
examples/seq2seq/bertabs/configuration_bertabs.py

@@ -28,7 +28,7 @@ BERTABS_FINETUNED_CONFIG_MAP = {
 class BertAbsConfig(PretrainedConfig):
-    r"""
-    Class to store the configuration of the BertAbs model.
+    r"""Class to store the configuration of the BertAbs model.

     Arguments:
         vocab_size: int
examples/seq2seq/bertabs/convert_bertabs_original_pytorch_checkpoint.py

@@ -62,7 +62,7 @@ BertAbsConfig = namedtuple(
 def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
-    """
-    Copy/paste and tweak the pre-trained weights provided by the creators
+    """Copy/paste and tweak the pre-trained weights provided by the creators
     of BertAbs for the internal architecture.
     """

@@ -164,13 +164,22 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--bertabs_checkpoint_path",
         default=None,
         type=str,
         required=True,
         help="Path the official PyTorch dump.",
     )
     parser.add_argument(
         "--pytorch_dump_folder_path",
         default=None,
         type=str,
         required=True,
         help="Path to the output PyTorch model.",
     )
     args = parser.parse_args()
     convert_bertabs_checkpoints(
         args.bertabs_checkpoint_path,
         args.pytorch_dump_folder_path,
     )
examples/seq2seq/bertabs/modeling_bertabs.py

@@ -105,10 +105,17 @@ class BertAbs(BertAbsPreTrainedModel):
                 p.data.zero_()

     def forward(
         self,
         encoder_input_ids,
         decoder_input_ids,
         token_type_ids,
         encoder_attention_mask,
         decoder_attention_mask,
     ):
         encoder_output = self.bert(
             input_ids=encoder_input_ids,
             token_type_ids=token_type_ids,
             attention_mask=encoder_attention_mask,
         )
         encoder_hidden_states = encoder_output[0]
         dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states)

@@ -117,8 +124,7 @@ class BertAbs(BertAbsPreTrainedModel):
 class Bert(nn.Module):
-    """ This class is not really necessary and should probably disappear.
-    """
+    """This class is not really necessary and should probably disappear."""

     def __init__(self):
         super().__init__()

@@ -307,7 +313,14 @@ class TransformerDecoderLayer(nn.Module):
         self.register_buffer("mask", mask)

     def forward(
         self,
         inputs,
         memory_bank,
         src_pad_mask,
         tgt_pad_mask,
         previous_input=None,
         layer_cache=None,
         step=None,
     ):
         """
         Args:

@@ -331,13 +344,25 @@ class TransformerDecoderLayer(nn.Module):
             all_input = torch.cat((previous_input, input_norm), dim=1)
             dec_mask = None

-        query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",)
+        query = self.self_attn(
+            all_input,
+            all_input,
+            input_norm,
+            mask=dec_mask,
+            layer_cache=layer_cache,
+            type="self",
+        )

         query = self.drop(query) + inputs
         query_norm = self.layer_norm_2(query)
         mid = self.context_attn(
             memory_bank,
             memory_bank,
             query_norm,
             mask=src_pad_mask,
             layer_cache=layer_cache,
             type="context",
         )
         output = self.feed_forward(self.drop(mid) + query)

@@ -422,7 +447,14 @@ class MultiHeadedAttention(nn.Module):
         self.final_linear = nn.Linear(model_dim, model_dim)

     def forward(
         self,
         key,
         value,
         query,
         mask=None,
         layer_cache=None,
         type=None,
         predefined_graph_1=None,
     ):
         """
         Compute the context vector and the attention vectors.

@@ -628,7 +660,7 @@ def gelu(x):
 class PositionwiseFeedForward(nn.Module):
-    """
-    A two-layer Feed-Forward-Network with residual layer norm.
+    """A two-layer Feed-Forward-Network with residual layer norm.

     Args:
         d_model (int): the size of input for the first-layer of the FFN.

@@ -770,8 +802,7 @@ class Translator(object):
         self.max_length = args.max_length

     def translate(self, batch, step, attn_debug=False):
-        """ Generates summaries from one batch of data.
-        """
+        """Generates summaries from one batch of data."""
         self.model.eval()
         with torch.no_grad():
             batch_data = self.translate_batch(batch)

@@ -798,8 +829,7 @@ class Translator(object):
     # Where the beam search lives
     # I have no idea why it is being called from the method above
     def _fast_translate_batch(self, batch, max_length, min_length=0):
-        """ Beam Search using the encoder inputs contained in `batch`.
-        """
+        """Beam Search using the encoder inputs contained in `batch`."""
         # The batch object is funny
         # Instead of just looking at the size of the arguments we encapsulate

@@ -981,7 +1011,7 @@ def tile(x, count, dim=0):
 class BertSumOptimizer(object):
-    """
-    Specific optimizer for BertSum.
+    """Specific optimizer for BertSum.

     As described in [1], the authors fine-tune BertSum for abstractive
     summarization using two Adam Optimizers with different warm-up steps and

@@ -999,10 +1029,16 @@ class BertSumOptimizer(object):
         self.optimizers = {
             "encoder": torch.optim.Adam(
                 model.encoder.parameters(),
                 lr=lr["encoder"],
                 betas=(beta_1, beta_2),
                 eps=eps,
             ),
             "decoder": torch.optim.Adam(
                 model.decoder.parameters(),
                 lr=lr["decoder"],
                 betas=(beta_1, beta_2),
                 eps=eps,
             ),
         }
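The BertSumOptimizer hunk above keeps two Adam instances with separate learning rates for the encoder and the decoder, stepped together, as its docstring describes. A minimal, self-contained sketch of that two-optimizer pattern (the toy ModuleDict model and the learning rates are illustrative, not the BertSum training loop):

    import torch
    import torch.nn as nn

    model = nn.ModuleDict({
        "encoder": nn.Linear(8, 8),
        "decoder": nn.Linear(8, 8),
    })
    lr = {"encoder": 2e-3, "decoder": 1e-2}
    beta_1, beta_2, eps = 0.9, 0.999, 1e-8

    # One Adam per sub-module, each with its own learning rate.
    optimizers = {
        name: torch.optim.Adam(
            module.parameters(),
            lr=lr[name],
            betas=(beta_1, beta_2),
            eps=eps,
        )
        for name, module in model.items()
    }

    # A single backward pass, then step and reset both optimizers together.
    loss = model["decoder"](model["encoder"](torch.randn(2, 8))).pow(2).mean()
    loss.backward()
    for opt in optimizers.values():
        opt.step()
        opt.zero_grad()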
examples/seq2seq/bertabs/run_summarization.py

@@ -99,7 +99,7 @@ def evaluate(args):
 def save_summaries(summaries, path, original_document_name):
-    """
-    Write the summaries in fies that are prefixed by the original
+    """Write the summaries in fies that are prefixed by the original
     files' name with the `_summary` appended.

     Attributes:

@@ -125,7 +125,7 @@ def save_summaries(summaries, path, original_document_name):
 def format_summary(translation):
-    """
-    Transforms the output of the `from_batch` function
+    """Transforms the output of the `from_batch` function
     into nicely formatted summaries.
     """
     raw_summary, _, _ = translation

@@ -190,7 +190,12 @@ def build_data_iterator(args, tokenizer):
     def collate_fn(data):
         return collate(data, tokenizer, block_size=512, device=args.device)

-    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
+    iterator = DataLoader(
+        dataset,
+        sampler=sampler,
+        batch_size=args.batch_size,
+        collate_fn=collate_fn,
+    )

     return iterator

@@ -201,7 +206,7 @@ def load_and_cache_examples(args, tokenizer):
 def collate(data, tokenizer, block_size, device):
-    """
-    Collate formats the data passed to the data loader.
+    """Collate formats the data passed to the data loader.

     In particular we tokenize the data batch after batch to avoid keeping them
     all in memory. We output the data as a namedtuple to fit the original BertAbs's

@@ -231,7 +236,7 @@ def collate(data, tokenizer, block_size, device):
 def decode_summary(summary_tokens, tokenizer):
-    """
-    Decode the summary and return it in a format
+    """Decode the summary and return it in a format
     suitable for evaluation.
     """
     summary_tokens = summary_tokens.to("cpu").numpy()

@@ -242,8 +247,7 @@ def decode_summary(summary_tokens, tokenizer):
 def main():
-    """ The main function defines the interface with the users.
-    """
+    """The main function defines the interface with the users."""
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--documents_dir",

@@ -268,23 +272,41 @@ def main():
     )
     # EVALUATION options
     parser.add_argument(
         "--no_cuda",
         default=False,
         type=bool,
         help="Whether to force the execution on CPU.",
     )
     parser.add_argument(
         "--batch_size",
         default=4,
         type=int,
         help="Batch size per GPU/CPU for training.",
     )
     # BEAM SEARCH arguments
     parser.add_argument(
         "--min_length",
         default=50,
         type=int,
         help="Minimum number of tokens for the summaries.",
     )
     parser.add_argument(
         "--max_length",
         default=200,
         type=int,
         help="Maixmum number of tokens for the summaries.",
     )
     parser.add_argument(
         "--beam_size",
         default=5,
         type=int,
         help="The number of beams to start with for each example.",
     )
     parser.add_argument(
         "--alpha",
         default=0.95,
         type=float,
         help="The value of alpha for the length penalty in the beam search.",
     )
     parser.add_argument(
         "--block_trigram",
examples/seq2seq/bertabs/test_utils_summarization.py

@@ -43,8 +43,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         self.assertEqual(truncate_or_pad(sequence, self.block_size, 0), expected_output)

     def test_process_story_no_highlights(self):
-        """ Processing a story with no highlights returns an empty list for the summary.
-        """
+        """Processing a story with no highlights returns an empty list for the summary."""
         raw_story = """It was the year of Our Lord one thousand seven hundred and
         seventy-five.\n\nSpiritual revelations were conceded to England at that
         favoured period, as at this."""

@@ -52,8 +51,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
         self.assertEqual(summary_lines, [])

     def test_process_empty_story(self):
-        """ An empty story returns an empty collection of lines.
-        """
+        """An empty story returns an empty collection of lines."""
         raw_story = ""
         story_lines, summary_lines = process_story(raw_story)
         self.assertEqual(story_lines, [])
examples/seq2seq/bertabs/utils_summarization.py

@@ -11,7 +11,7 @@ from torch.utils.data import Dataset
 class CNNDMDataset(Dataset):
-    """
-    Abstracts the dataset used to train seq2seq models.
+    """Abstracts the dataset used to train seq2seq models.

     The class will process the documents that are located in the specified
     folder. The preprocessing will work on any document that is reasonably

@@ -31,7 +31,7 @@ class CNNDMDataset(Dataset):
     """

     def __init__(self, path="", prefix="train"):
-        """
-        We initialize the class by listing all the documents to summarize.
+        """We initialize the class by listing all the documents to summarize.
         Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
         """
         assert os.path.isdir(path)

@@ -60,7 +60,7 @@ class CNNDMDataset(Dataset):
 def process_story(raw_story):
-    """
-    Extract the story and summary from a story file.
+    """Extract the story and summary from a story file.

     Arguments:
         raw_story (str): content of the story file as an utf-8 encoded string.

@@ -108,7 +108,7 @@ def _add_missing_period(line):
 def truncate_or_pad(sequence, block_size, pad_token_id):
-    """
-    Adapt the source and target sequences' lengths to the block size.
+    """Adapt the source and target sequences' lengths to the block size.
     If the sequence is shorter we append padding token to the right of the sequence.
     """
     if len(sequence) > block_size:

@@ -119,8 +119,8 @@ def truncate_or_pad(sequence, block_size, pad_token_id):
 def build_mask(sequence, pad_token_id):
-    """
-    Builds the mask. The attention mechanism will only attend to positions
-    with value 1.
-    """
+    """Builds the mask. The attention mechanism will only attend to positions
+    with value 1."""
     mask = torch.ones_like(sequence)
     idx_pad_tokens = sequence == pad_token_id
     mask[idx_pad_tokens] = 0

@@ -128,7 +128,7 @@ def build_mask(sequence, pad_token_id):
 def encode_for_summarization(story_lines, summary_lines, tokenizer):
-    """
-    Encode the story and summary lines, and join them
+    """Encode the story and summary lines, and join them
     as specified in [1] by using `[SEP] [CLS]` tokens to separate
     sentences.
     """

@@ -141,7 +141,7 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer):
 def compute_token_type_ids(batch, separator_token_id):
-    """
-    Segment embeddings as described in [1]
+    """Segment embeddings as described in [1]

     The values {0,1} were found in the repository [2].
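The build_mask hunk above only rejoins the docstring, but the function is worth seeing end to end: it masks out exactly the positions equal to pad_token_id, so attention attends only to real tokens. A runnable check using the function body shown in the diff (the pad id and tensor values are illustrative):

    import torch

    def build_mask(sequence, pad_token_id):
        # 1 everywhere, then 0 at padding positions.
        mask = torch.ones_like(sequence)
        idx_pad_tokens = sequence == pad_token_id
        mask[idx_pad_tokens] = 0
        return mask

    seq = torch.tensor([[5, 9, 0, 0], [7, 0, 0, 0]])
    print(build_mask(seq, pad_token_id=0))
    # tensor([[1, 1, 0, 0],
    #         [1, 0, 0, 0]])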
examples/seq2seq/callbacks.py

@@ -97,4 +97,9 @@ def get_checkpoint_callback(output_dir, metric):
 def get_early_stopping_callback(metric, patience):
-    return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,)
+    return EarlyStopping(
+        monitor=f"val_{metric}",
+        mode="max",
+        patience=patience,
+        verbose=True,
+    )
examples/seq2seq/distillation.py

@@ -348,7 +348,10 @@ class T5SummarizationDistiller(BartSummarizationDistiller):
         if self.different_encoder:
             with torch.no_grad():
                 teacher_enc_outputs, teacher_enc_hid = self.teacher.encoder(
                     source_ids,
                     attention_mask=source_mask,
                     output_hidden_states=True,
                     use_cache=False,
                 )
             if self.hparams.alpha_encoder_loss > 0:
                 loss_encoder = self.calc_mse_loss(enc_outputs, teacher_enc_outputs, source_mask)
examples/seq2seq/test_seq2seq_examples.py

@@ -117,7 +117,12 @@ class TestSummarizationDistiller(unittest.TestCase):
     @require_multigpu
     def test_multigpu(self):
-        updates = dict(no_teacher=True, freeze_encoder=True, gpus=2, sortish_sampler=False,)
+        updates = dict(
+            no_teacher=True,
+            freeze_encoder=True,
+            gpus=2,
+            sortish_sampler=False,
+        )
         self._test_distiller_cli(updates)

     def test_distill_no_teacher(self):

@@ -261,7 +266,8 @@ def test_run_eval_bart(model):
 @pytest.mark.parametrize(
     ["model"],
     [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
 )
 def test_finetune(model):
     args_d: dict = CHEAP_ARGS.copy()

@@ -329,7 +335,8 @@ def test_finetune_extra_model_args():
     output_dir = tempfile.mkdtemp(prefix="output_1_")
     args_d1 = args_d.copy()
     args_d1.update(
         model_name_or_path=model,
         output_dir=output_dir,
     )
     extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
     for p in extra_model_params:

@@ -344,7 +351,8 @@ def test_finetune_extra_model_args():
     output_dir = tempfile.mkdtemp(prefix="output_2_")
     args_d2 = args_d.copy()
     args_d2.update(
         model_name_or_path=model,
         output_dir=output_dir,
     )
     unsupported_param = "encoder_layerdrop"
     args_d2[unsupported_param] = 0.5

@@ -478,7 +486,11 @@ def test_summarization_dataset_truncation(tok):
     max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
     trunc_target = 4
     train_dataset = Seq2SeqDataset(
         tokenizer,
         data_dir=tmp_dir,
         type_path="train",
         max_source_length=20,
         max_target_length=trunc_target,
     )
     dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
     for batch in dataloader:
examples/seq2seq/utils.py

@@ -63,7 +63,9 @@ def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
 def trim_batch(
     input_ids,
     pad_token_id,
     attention_mask=None,
 ):
     """Remove columns that are populated exclusively by pad_token_id"""
     keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
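trim_batch, reformatted above, drops any column of input_ids populated exclusively by the pad token, shortening a batch to its longest real sequence. A small worked example of the keep_column_mask logic (pad id and values are illustrative):

    import torch

    input_ids = torch.tensor([[5, 9, 0, 0], [7, 3, 2, 0]])
    pad_token_id = 0

    # True for any column that contains at least one non-pad token.
    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
    print(input_ids[:, keep_column_mask])
    # tensor([[5, 9, 0],
    #         [7, 3, 2]])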
examples/text-classification/run_pl_glue.py

@@ -153,7 +153,11 @@ class GLUETransformer(BaseTransformer):
         )
         parser.add_argument(
             "--task",
             default="",
             type=str,
             required=True,
             help="The GLUE task to run",
         )
         parser.add_argument(
             "--gpus",

@@ -177,7 +181,10 @@ def main():
     # If output_dir not provided, a folder will be generated in pwd
     if args.output_dir is None:
-        args.output_dir = os.path.join("./results", f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",)
+        args.output_dir = os.path.join(
+            "./results",
+            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
+        )
         os.makedirs(args.output_dir)

     model = GLUETransformer(args)
examples/text-classification/run_xnli.py

@@ -328,7 +328,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
             examples,
             tokenizer,
             max_length=args.max_seq_length,
             label_list=label_list,
             output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
examples/text-generation/pplm/run_pplm.py

@@ -698,7 +698,9 @@ def run_pplm_example(
         for word_id in pert_gen_tok_text.tolist()[0]:
             if word_id in bow_word_ids:
                 pert_gen_text += "{}{}{}".format(
                     colorama.Fore.RED,
                     tokenizer.decode([word_id]),
                     colorama.Style.RESET_ALL,
                 )
             else:
                 pert_gen_text += tokenizer.decode([word_id])

@@ -729,7 +731,10 @@ if __name__ == "__main__":
    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
    parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
    parser.add_argument(
        "--num_samples",
        type=int,
        default=1,
        help="Number of samples to generate from the modified latents",
    )
    parser.add_argument(
        "--bag_of_words",

@@ -751,13 +756,22 @@ if __name__ == "__main__":
        help="Discriminator to use",
    )
    parser.add_argument(
        "--discrim_weights",
        type=str,
        default=None,
        help="Weights for the generic discriminator",
    )
    parser.add_argument(
        "--discrim_meta",
        type=str,
        default=None,
        help="Meta information for the generic discriminator",
    )
    parser.add_argument(
        "--class_label",
        type=int,
        default=-1,
        help="Class label used for the discriminator",
    )
    parser.add_argument("--length", type=int, default=100)
    parser.add_argument("--stepsize", type=float, default=0.02)

@@ -773,7 +787,10 @@ if __name__ == "__main__":
        help="Length of past which is being optimized; 0 corresponds to infinite window length",
    )
    parser.add_argument(
        "--horizon_length",
        type=int,
        default=1,
        help="Length of future to optimize over",
    )
    parser.add_argument("--decay", action="store_true", help="whether to decay or not")
    parser.add_argument("--gamma", type=float, default=1.5)

@@ -783,7 +800,10 @@ if __name__ == "__main__":
    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
    parser.add_argument("--colorama", action="store_true", help="colors keywords")
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.0,
        help="Penalize repetition. More than 1.0 -> less repetition",
    )
    args = parser.parse_args()
examples/text-generation/pplm/run_pplm_discrim_train.py

@@ -242,7 +242,12 @@ def train_discriminator(
         text = torchtext_data.Field()
         label = torchtext_data.Field(sequential=False)
-        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
+        train_data, val_data, test_data = datasets.SST.splits(
+            text,
+            label,
+            fine_grained=True,
+            train_subtrees=True,
+        )

         x = []
         y = []
examples/text-generation/run_generation.py

@@ -41,7 +41,9 @@ from transformers import (
 logging.basicConfig(
     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
     datefmt="%m/%d/%Y %H:%M:%S",
     level=logging.INFO,
 )
 logger = logging.getLogger(__name__)

@@ -197,7 +199,10 @@ def main():
     args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

     logger.warning(
         "device: %s, n_gpu: %s, 16-bits training: %s",
         args.device,
         args.n_gpu,
         args.fp16,
     )

     set_seed(args)
Page 1 of 10; the remaining changed files are listed on the following pages.