transformers, commit a75c64d8

Black 20 release
Authored Aug 26, 2020 by Lysandre
Parent: e78c1103
Changes: 191 changed files in total; this page shows 20 changed files with 525 additions and 381 deletions (+525 / -381).
examples/adversarial/utils_hans.py                              +8    -2
examples/benchmarking/plot_csv_file.py                          +7    -3
examples/bert-loses-patience/pabee/modeling_pabee_albert.py     +68   -62
examples/bert-loses-patience/pabee/modeling_pabee_bert.py       +62   -62
examples/bert-loses-patience/run_glue_with_pabee.py             +65   -18
examples/bertology/run_bertology.py                             +9    -8
examples/contrib/mm-imdb/utils_mmimdb.py                        +4    -1
examples/contrib/run_camembert.py                               +5    -1
examples/contrib/run_openai_gpt.py                              +7    -4
examples/contrib/run_swag.py                                    +3    -1
examples/deebert/run_glue_deebert.py                            +5    -1
examples/deebert/src/modeling_highway_bert.py                   +59   -59
examples/deebert/src/modeling_highway_roberta.py                +28   -27
examples/distillation/run_squad_w_distillation.py               +14   -8
examples/distillation/utils.py                                  +2    -1
examples/language-modeling/run_language_modeling.py             +3    -1
examples/lightning_base.py                                      +6    -2
examples/longform-qa/eli5_app.py                                +19   -5
examples/longform-qa/eli5_utils.py                              +43   -9
examples/movement-pruning/emmental/modeling_bert_masked.py      +108  -106
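Most of the hunks below show the same transformation, consistent with the "magic trailing comma" rule introduced in Black 20.8b: a call or literal that already ends in a trailing comma is exploded to one element per line, while constructs without one keep their packed layout. A minimal sketch of that rule, using a toy function rather than code from this commit:

# Toy example (not from this commit): how Black 20.8b treats a pre-existing trailing comma.
def join_words(first, second, third):
    return " ".join([first, second, third])

# Before reformatting: packed arguments, but with a trailing comma inside the call.
packed = join_words("black", "twenty", "release",)

# After Black 20.8b: the trailing comma forces one argument per line.
exploded = join_words(
    "black",
    "twenty",
    "release",
)

print(packed == exploded)  # True; only the formatting differs.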
examples/adversarial/utils_hans.py

@@ -112,7 +112,10 @@ if is_torch_available():
         cached_features_file = os.path.join(
             data_dir,
             "cached_{}_{}_{}_{}".format(
-                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                "dev" if evaluate else "train",
+                tokenizer.__class__.__name__,
+                str(max_seq_length),
+                task,
             ),
         )
         label_list = processor.get_labels()

@@ -278,7 +281,10 @@ class HansProcessor(DataProcessor):
 def hans_convert_examples_to_features(
-    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
 ):
     """
         Loads a data file into a list of ``InputFeatures``
examples/benchmarking/plot_csv_file.py

@@ -20,7 +20,9 @@ class PlotArguments:
     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
     """

-    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    csv_file: str = field(
+        metadata={"help": "The csv file to plot."},
+    )
     plot_along_batch: bool = field(
         default=False,
         metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},

@@ -30,7 +32,8 @@ class PlotArguments:
         metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
     )
     no_log_scale: bool = field(
-        default=False, metadata={"help": "Disable logarithmic scale when plotting"},
+        default=False,
+        metadata={"help": "Disable logarithmic scale when plotting"},
     )
     is_train: bool = field(
         default=False,

@@ -39,7 +42,8 @@ class PlotArguments:
         },
     )
     figure_png_file: Optional[str] = field(
-        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+        default=None,
+        metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
     )
     short_model_names: Optional[List[str]] = list_field(
         default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
examples/bert-loses-patience/pabee/modeling_pabee_albert.py

@@ -157,7 +157,10 @@ class AlbertModelWithPabee(AlbertModel):
             res = []
             for i in range(self.config.num_hidden_layers):
                 encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                 )

                 pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))

@@ -174,7 +177,10 @@ class AlbertModelWithPabee(AlbertModel):
             for i in range(self.config.num_hidden_layers):
                 calculated_layer_num += 1
                 encoder_outputs = self.encoder.adaptive_forward(
-                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                    encoder_outputs,
+                    current_layer=i,
+                    attention_mask=extended_attention_mask,
+                    head_mask=head_mask,
                 )

                 pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
examples/bert-loses-patience/pabee/modeling_pabee_bert.py
(diff not expanded on this page)
examples/bert-loses-patience/run_glue_with_pabee.py

@@ -120,7 +120,10 @@ def train(args, train_dataset, model, tokenizer):
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+            model,
+            device_ids=[args.local_rank],
+            output_device=args.local_rank,
+            find_unused_parameters=True,
         )

     # Train!

@@ -151,13 +154,17 @@ def train(args, train_dataset, model, tokenizer):
         logger.info(" Continuing training from epoch %d", epochs_trained)
         logger.info(" Continuing training from global step %d", global_step)
         logger.info(
-            " Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
+            " Will skip the first %d steps in the first epoch",
+            steps_trained_in_current_epoch,
         )

     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+        epochs_trained,
+        int(args.num_train_epochs),
+        desc="Epoch",
+        disable=args.local_rank not in [-1, 0],
     )
     set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:

@@ -372,7 +379,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)

@@ -434,15 +445,24 @@ def main():
         help="The output directory where the model predictions and checkpoints will be written.",
     )
     parser.add_argument(
-        "--patience", default="0", type=str, required=False,
+        "--patience",
+        default="0",
+        type=str,
+        required=False,
     )
     parser.add_argument(
-        "--regression_threshold", default=0, type=float, required=False,
+        "--regression_threshold",
+        default=0,
+        type=float,
+        required=False,
     )

     # Other parameters
     parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name",
     )
     parser.add_argument(
         "--tokenizer_name",

@@ -466,17 +486,27 @@ def main():
     parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
     parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+        "--evaluate_during_training",
+        action="store_true",
+        help="Run evaluation during training at each logging step.",
     )
     parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+        "--do_lower_case",
+        action="store_true",
+        help="Set this flag if you are using an uncased model.",
     )

     parser.add_argument(
-        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+        "--per_gpu_train_batch_size",
+        default=8,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
     )
     parser.add_argument(
-        "--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
+        "--per_gpu_eval_batch_size",
+        default=1,
+        type=int,
+        help="Batch size per GPU/CPU for evaluation.",
     )
     parser.add_argument(
         "--gradient_accumulation_steps",

@@ -485,13 +515,19 @@ def main():
         help="Number of updates steps to accumulate before performing a backward/update pass.",
     )
     parser.add_argument(
-        "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
+        "--learning_rate",
+        default=5e-5,
+        type=float,
+        help="The initial learning rate for Adam.",
     )
     parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+        "--num_train_epochs",
+        default=3.0,
+        type=float,
+        help="Total number of training epochs to perform.",
     )
     parser.add_argument(
         "--max_steps",

@@ -503,7 +539,10 @@ def main():
     parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
     parser.add_argument(
-        "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
+        "--save_steps",
+        type=int,
+        default=500,
+        help="Save checkpoint every X updates steps.",
     )
     parser.add_argument(
         "--eval_all_checkpoints",

@@ -512,10 +551,14 @@ def main():
     )
     parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
     parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+        "--overwrite_output_dir",
+        action="store_true",
+        help="Overwrite the content of the output directory",
     )
     parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+        "--overwrite_cache",
+        action="store_true",
+        help="Overwrite the cached training and evaluation sets",
     )
     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

@@ -532,7 +575,10 @@ def main():
         "See details at https://nvidia.github.io/apex/amp.html",
     )
     parser.add_argument(
-        "--local_rank", type=int, default=-1, help="For distributed training: local_rank",
+        "--local_rank",
+        type=int,
+        default=-1,
+        help="For distributed training: local_rank",
     )
     parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
     parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")

@@ -634,7 +680,8 @@ def main():
     print("Output Layers Parameters:", output_layers_param_num)
     single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
     print(
-        "Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
+        "Added Output Layers Parameters:",
+        output_layers_param_num - single_output_layer_param_num,
     )

     logger.info("Training/evaluation parameters %s", args)
examples/bertology/run_bertology.py

@@ -66,7 +66,7 @@ def print_2d_tensor(tensor):
 def compute_heads_importance(
     args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
-    """
-    This method shows how to compute:
+    """This method shows how to compute:
     - head attention entropy
     - head importance scores according to http://arxiv.org/abs/1905.10650
     """

@@ -150,7 +150,7 @@ compute_heads_importance(
 def mask_heads(args, model, eval_dataloader):
-    """
-    This method shows how to mask head (set some heads to zero), to test the effect on the network,
+    """This method shows how to mask head (set some heads to zero), to test the effect on the network,
     based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
     """
     _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)

@@ -201,7 +201,7 @@ def mask_heads(args, model, eval_dataloader):
 def prune_heads(args, model, eval_dataloader, head_mask):
-    """
-    This method shows how to prune head (remove heads weights) based on
+    """This method shows how to prune head (remove heads weights) based on
     the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
     """
     # Try pruning and test time speedup

@@ -395,7 +395,8 @@ def main():
         cache_dir=args.cache_dir,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir,
     )
     model = AutoModelForSequenceClassification.from_pretrained(
         args.model_name_or_path,
examples/contrib/mm-imdb/utils_mmimdb.py

@@ -138,6 +138,9 @@ def get_image_transforms():
             transforms.Resize(256),
             transforms.CenterCrop(224),
             transforms.ToTensor(),
-            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
+            transforms.Normalize(
+                mean=[0.46777044, 0.44531429, 0.40661017],
+                std=[0.12221994, 0.12145835, 0.14380469],
+            ),
         ]
     )
examples/contrib/run_camembert.py

@@ -30,7 +30,11 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
             )
         else:
             topk_filled_outputs.append(
-                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+                (
+                    masked_input.replace(masked_token, predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
             )
     return topk_filled_outputs
examples/contrib/run_openai_gpt.py

@@ -71,7 +71,7 @@ def load_rocstories_dataset(dataset_path):
 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
-    """
-    Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+    """Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

     To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
     input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]

@@ -83,7 +83,10 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
         lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
-        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
+        for (
+            i,
+            (story, cont1, cont2, mc_label),
+        ) in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
             with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
             input_ids[i, 0, : len(with_cont1)] = with_cont1
examples/contrib/run_swag.py

@@ -629,7 +629,9 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

     config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+    )
     model = AutoModelForMultipleChoice.from_pretrained(
         args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
     )
examples/deebert/run_glue_deebert.py

@@ -358,7 +358,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         )
         features = convert_examples_to_features(
-            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
         )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
examples/deebert/src/modeling_highway_bert.py

@@ -14,8 +14,7 @@ from transformers.modeling_bert import (
 def entropy(x):
-    """ Calculate entropy of a pre-softmax logit Tensor
-    """
+    """Calculate entropy of a pre-softmax logit Tensor"""
     exp_x = torch.exp(x)
     A = torch.sum(exp_x, dim=1)  # sum of exp(x_i)
     B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)

@@ -104,7 +103,8 @@ class DeeBertEncoder(nn.Module):
 @add_start_docstrings(
-    "The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
+    "The Bert Model transformer with early exiting (DeeBERT). ",
+    BERT_START_DOCSTRING,
 )
 class DeeBertModel(BertPreTrainedModel):
     def __init__(self, config):

@@ -127,7 +127,7 @@ class DeeBertModel(BertPreTrainedModel):
         self.embeddings.word_embeddings = value

     def _prune_heads(self, heads_to_prune):
-        """
-        Prunes heads of the model.
+        """Prunes heads of the model.
         heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         See base class PreTrainedModel
         """
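The entropy helper in the first hunk above is truncated after its two reductions. Presumably (an inference from the comments in the hunk, not something visible in the diff) it combines them as torch.log(A) - B / A, which equals the entropy of softmax(x) expressed in terms of those two sums. A small self-contained check of that identity:

# Hedged sketch: verifying that log(A) - B / A equals the entropy of softmax(x),
# where A = sum(exp(x_i)) and B = sum(x_i * exp(x_i)) as in the hunk above.
import torch

x = torch.randn(3, 5)  # toy batch of pre-softmax logits

exp_x = torch.exp(x)
A = torch.sum(exp_x, dim=1)      # sum of exp(x_i)
B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)
entropy_from_sums = torch.log(A) - B / A

p = torch.softmax(x, dim=1)
entropy_direct = -torch.sum(p * torch.log(p), dim=1)

print(torch.allclose(entropy_from_sums, entropy_direct, atol=1e-6))  # True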
examples/deebert/src/modeling_highway_roberta.py

@@ -11,7 +11,8 @@ from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayExc
 @add_start_docstrings(
-    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
+    "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
+    ROBERTA_START_DOCSTRING,
 )
 class DeeRobertaModel(DeeBertModel):
examples/distillation/run_squad_w_distillation.py

@@ -228,14 +228,20 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                 assert end_logits_tea.size() == end_logits_stu.size()

                 loss_fct = nn.KLDivLoss(reduction="batchmean")
-                loss_start = loss_fct(
-                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                    F.softmax(start_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
-                loss_end = loss_fct(
-                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                    F.softmax(end_logits_tea / args.temperature, dim=-1),
-                ) * (args.temperature ** 2)
+                loss_start = (
+                    loss_fct(
+                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
+                loss_end = (
+                    loss_fct(
+                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                    )
+                    * (args.temperature ** 2)
+                )
                 loss_ce = (loss_start + loss_end) / 2.0
                 loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
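For context on the hunk above (the reformatting does not change behavior): the two wrapped expressions compute a temperature-scaled knowledge-distillation loss. nn.KLDivLoss with reduction="batchmean" expects log-probabilities from the student and probabilities from the teacher, and the temperature ** 2 factor keeps the soft-target gradients on a comparable scale across temperatures. A standalone sketch with toy tensors, not code from the repository:

# Standalone sketch (toy tensors; mirrors the distillation term in the hunk above).
import torch
import torch.nn as nn
import torch.nn.functional as F

temperature = 2.0
start_logits_stu = torch.randn(4, 384)  # student logits for a toy batch
start_logits_tea = torch.randn(4, 384)  # teacher logits for the same batch

loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = (
    loss_fct(
        F.log_softmax(start_logits_stu / temperature, dim=-1),
        F.softmax(start_logits_tea / temperature, dim=-1),
    )
    * (temperature ** 2)  # compensates for the 1/T^2 shrinkage of soft-target gradients
)
print(loss_start.item())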
examples/distillation/utils.py

@@ -118,7 +118,8 @@ def init_gpu_params(params):
     if params.multi_gpu:
         logger.info("Initializing PyTorch distributed")
         torch.distributed.init_process_group(
-            init_method="env://", backend="nccl",
+            init_method="env://",
+            backend="nccl",
         )
examples/language-modeling/run_language_modeling.py

@@ -233,7 +233,9 @@ def main():
     eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
     if config.model_type == "xlnet":
         data_collator = DataCollatorForPermutationLanguageModeling(
-            tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
+            tokenizer=tokenizer,
+            plm_probability=data_args.plm_probability,
+            max_span_length=data_args.max_span_length,
         )
     else:
         data_collator = DataCollatorForLanguageModeling(
examples/lightning_base.py

@@ -226,10 +226,14 @@ class BaseTransformer(pl.LightningModule):
             help="Decoder layer dropout probability (Optional). Goes into model.config",
         )
         parser.add_argument(
-            "--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
+            "--dropout",
+            type=float,
+            help="Dropout probability (Optional). Goes into model.config",
         )
         parser.add_argument(
-            "--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
+            "--attention_dropout",
+            type=float,
+            help="Attention dropout probability (Optional). Goes into model.config",
         )
         parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
         parser.add_argument(
examples/longform-qa/eli5_app.py

@@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10):
         )
     else:
         support_doc, hit_lst = query_es_index(
-            question, es_client, index_name="english_wiki40b_snippets_100w", n_results=n_results,
+            question,
+            es_client,
+            index_name="english_wiki40b_snippets_100w",
+            n_results=n_results,
         )
     support_list = [
         (res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst

@@ -154,7 +157,8 @@ header_full = """
     header_html,
 )
 st.sidebar.markdown(
-    header_full, unsafe_allow_html=True,
+    header_full,
+    unsafe_allow_html=True,
 )

 # Long Form QA with ELI5 and Wikipedia

@@ -173,9 +177,17 @@ action_list = [
 ]
 demo_options = st.sidebar.checkbox("Demo options")
 if demo_options:
-    action_st = st.sidebar.selectbox("", action_list, index=3,)
+    action_st = st.sidebar.selectbox(
+        "",
+        action_list,
+        index=3,
+    )
     action = action_list.index(action_st)
-    show_type = st.sidebar.selectbox("", ["Show full text of passages", "Show passage section titles"], index=0,)
+    show_type = st.sidebar.selectbox(
+        "",
+        ["Show full text of passages", "Show passage section titles"],
+        index=0,
+    )
     show_passages = show_type == "Show full text of passages"
 else:
     action = 3

@@ -250,7 +262,9 @@ questions_list = [
     "How does New Zealand have so many large bird predators?",
 ]
 question_s = st.selectbox(
-    "What would you like to ask? ---- select <MY QUESTION> to enter a new query", questions_list, index=1,
+    "What would you like to ask? ---- select <MY QUESTION> to enter a new query",
+    questions_list,
+    index=1,
 )
 if question_s == "<MY QUESTION>":
     question = st.text_input("Enter your question here:", "")
examples/longform-qa/eli5_utils.py

@@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki
             yield passage

     # create the ES index
-    for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator(),):
+    for ok, action in streaming_bulk(
+        client=es_client,
+        index=index_name,
+        actions=passage_generator(),
+    ):
         progress.update(1)
         successes += ok
     print("Indexed %d documents" % (successes,))

@@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module):
             # define function for checkpointing
             def partial_encode(*inputs):
-                encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,)
+                encoder_outputs = self.sent_encoder.encoder(
+                    inputs[0],
+                    attention_mask=inputs[1],
+                    head_mask=head_mask,
+                )
                 sequence_output = encoder_outputs[0]
                 pooled_output = self.sent_encoder.pooler(sequence_output)
                 return pooled_output

@@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar
         if step % args.print_freq == 0 or step == 1:
             print(
                 "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                 )
             )
             loc_loss = 0

@@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc
         if step % args.print_freq == 0:
             print(
                 "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset_list[0]) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset_list[0]) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                 )
             )
             loc_loss = 0

@@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset):
             self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"]))
         document = self.document_cache[q_id]
         in_st = "question: {} context: {}".format(
-            question.lower().replace(" --t--", "").strip(), document.lower().strip(),
+            question.lower().replace(" --t--", "").strip(),
+            document.lower().strip(),
         )
         out_st = answer
         return (in_st, out_st)

@@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e=
         if step % args.print_freq == 0 or step == 1:
             print(
                 "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                    e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                    e,
+                    step,
+                    len(dataset) // args.batch_size,
+                    loc_loss / loc_steps,
+                    time() - st_time,
                 )
             )
             loc_loss = 0

@@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):
             if step % args.print_freq == 0:
                 print(
                     "{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
-                        step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time,
+                        step,
+                        len(dataset) // args.batch_size,
+                        loc_loss / loc_steps,
+                        time() - st_time,
                     )
                 )
-    print("Total \t L: {:.3f} \t -- {:.3f}".format(loc_loss / loc_steps, time() - st_time,))
+    print(
+        "Total \t L: {:.3f} \t -- {:.3f}".format(
+            loc_loss / loc_steps,
+            time() - st_time,
+        )
+    )


 def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):

@@ -506,7 +535,12 @@ def qa_s2s_generate(
     max_input_length=512,
     device="cuda:0",
 ):
-    model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device,)
+    model_inputs = make_qa_s2s_batch(
+        [(question_doc, "A")],
+        qa_s2s_tokenizer,
+        max_input_length,
+        device=device,
+    )
     n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
     generated_ids = qa_s2s_model.generate(
         input_ids=model_inputs["input_ids"],
examples/movement-pruning/emmental/modeling_bert_masked.py

@@ -37,8 +37,7 @@ logger = logging.getLogger(__name__)
 class BertEmbeddings(nn.Module):
-    """Construct the embeddings from word, position and token_type embeddings.
-    """
+    """Construct the embeddings from word, position and token_type embeddings."""

     def __init__(self, config):
         super().__init__()

@@ -385,7 +384,7 @@ class BertPooler(nn.Module):
 class MaskedBertPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and
+    """An abstract class to handle weights initialization and
     a simple interface for downloading and loading pretrained models.
     """

@@ -492,7 +491,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
         self.embeddings.word_embeddings = value

     def _prune_heads(self, heads_to_prune):
-        """
-        Prunes heads of the model.
+        """Prunes heads of the model.
         heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         See base class PreTrainedModel
         """

@@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
         start_logits = start_logits.squeeze(-1)
         end_logits = end_logits.squeeze(-1)

-        outputs = (start_logits, end_logits,) + outputs[2:]
+        outputs = (
+            start_logits,
+            end_logits,
+        ) + outputs[2:]
         if start_positions is not None and end_positions is not None:
             # If we are on multi-GPU, split add a dimension
             if len(start_positions.size()) > 1: