"web/git@developer.sourcefind.cn:chenpangpang/ComfyUI.git" did not exist on "74297f5f9dad140ef209e6f89b38d88839bae6fe"
Commit a75c64d8 authored by Lysandre

Black 20 release

parent e78c1103
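This commit applies the Black 20 formatter (presumably the 20.8b0 release) to the example scripts. Most hunks below are mechanical: Black 20 introduces the "magic trailing comma", so a call or signature that already ends in a trailing comma is exploded onto one argument per line, and docstrings are re-indented with the stray space after the opening quotes removed. The snippet below is an illustration only, not part of the commit; `f`, `tokenizer_name`, `max_seq_length`, and `task` are placeholder names, and it assumes a Black version with the magic-trailing-comma behavior is installed.

    import black

    source = (
        "f(\n"
        '    "dev" if evaluate else "train", tokenizer_name, str(max_seq_length), task,\n'
        ")\n"
    )

    # Because the call already ends with a trailing comma, Black >= 20.8b0 keeps it
    # expanded and puts every argument on its own line, which is the shape of the
    # first hunk below.
    print(black.format_str(source, mode=black.FileMode()))
    # f(
    #     "dev" if evaluate else "train",
    #     tokenizer_name,
    #     str(max_seq_length),
    #     task,
    # )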
@@ -112,7 +112,10 @@ if is_torch_available():
            cached_features_file = os.path.join(
                data_dir,
                "cached_{}_{}_{}_{}".format(
-                   "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                   "dev" if evaluate else "train",
+                   tokenizer.__class__.__name__,
+                   str(max_seq_length),
+                   task,
                ),
            )
            label_list = processor.get_labels()
@@ -278,7 +281,10 @@ class HansProcessor(DataProcessor):
def hans_convert_examples_to_features(
-   examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+   examples: List[InputExample],
+   label_list: List[str],
+   max_length: int,
+   tokenizer: PreTrainedTokenizer,
):
    """
    Loads a data file into a list of ``InputFeatures``
...
@@ -20,7 +20,9 @@ class PlotArguments:
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """
-   csv_file: str = field(metadata={"help": "The csv file to plot."},)
+   csv_file: str = field(
+       metadata={"help": "The csv file to plot."},
+   )
    plot_along_batch: bool = field(
        default=False,
        metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@@ -30,7 +32,8 @@ class PlotArguments:
        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
    )
    no_log_scale: bool = field(
-       default=False, metadata={"help": "Disable logarithmic scale when plotting"},
+       default=False,
+       metadata={"help": "Disable logarithmic scale when plotting"},
    )
    is_train: bool = field(
        default=False,
@@ -39,7 +42,8 @@ class PlotArguments:
        },
    )
    figure_png_file: Optional[str] = field(
-       default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+       default=None,
+       metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
    )
    short_model_names: Optional[List[str]] = list_field(
        default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
...
@@ -101,30 +101,30 @@ class AlbertModelWithPabee(AlbertModel):
        regression=False,
    ):
        r"""
        Return:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
                Sequence of hidden-states at the output of the last layer of the model.
            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
                Last layer hidden-state of the first token of the sequence (classification token)
                further processed by a Linear layer and a Tanh activation function. The Linear
                layer weights are trained from the next sentence prediction (classification)
                objective during pre-training.
                This output is usually *not* a good summary
                of the semantic content of the input, you're often better with averaging or pooling
                the sequence of hidden-states for the whole input sequence.
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
        """
        if input_ids is not None and inputs_embeds is not None:
@@ -157,7 +157,10 @@ class AlbertModelWithPabee(AlbertModel):
        res = []
        for i in range(self.config.num_hidden_layers):
            encoder_outputs = self.encoder.adaptive_forward(
-               encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+               encoder_outputs,
+               current_layer=i,
+               attention_mask=extended_attention_mask,
+               head_mask=head_mask,
            )
            pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -174,7 +177,10 @@ class AlbertModelWithPabee(AlbertModel):
        for i in range(self.config.num_hidden_layers):
            calculated_layer_num += 1
            encoder_outputs = self.encoder.adaptive_forward(
-               encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+               encoder_outputs,
+               current_layer=i,
+               attention_mask=extended_attention_mask,
+               head_mask=head_mask,
            )
            pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -236,42 +242,42 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
        Returns:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
            loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
                Classification (or regression if config.num_labels==1) loss.
            logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
        Examples::
            from transformers import AlbertTokenizer
            from pabee import AlbertForSequenceClassificationWithPabee
            import torch
            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
            model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids, labels=labels)
            loss, logits = outputs[:2]
        """
...
@@ -108,30 +108,30 @@ class BertModelWithPabee(BertModel):
        regression=False,
    ):
        r"""
        Return:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
                Sequence of hidden-states at the output of the last layer of the model.
            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
                Last layer hidden-state of the first token of the sequence (classification token)
                further processed by a Linear layer and a Tanh activation function. The Linear
                layer weights are trained from the next sentence prediction (classification)
                objective during pre-training.
                This output is usually *not* a good summary
                of the semantic content of the input, you're often better with averaging or pooling
                the sequence of hidden-states for the whole input sequence.
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
        """
        if input_ids is not None and inputs_embeds is not None:
@@ -266,44 +266,44 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Returns:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
                Classification (or regression if config.num_labels==1) loss.
            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
        Examples::
            from transformers import BertTokenizer, BertForSequenceClassification
            from pabee import BertForSequenceClassificationWithPabee
            import torch
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids, labels=labels)
            loss, logits = outputs[:2]
        """
...
@@ -120,7 +120,10 @@ def train(args, train_dataset, model, tokenizer):
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
-           model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+           model,
+           device_ids=[args.local_rank],
+           output_device=args.local_rank,
+           find_unused_parameters=True,
        )
    # Train!
@@ -151,13 +154,17 @@ def train(args, train_dataset, model, tokenizer):
        logger.info(" Continuing training from epoch %d", epochs_trained)
        logger.info(" Continuing training from global step %d", global_step)
        logger.info(
-           " Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
+           " Will skip the first %d steps in the first epoch",
+           steps_trained_in_current_epoch,
        )
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
-       epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+       epochs_trained,
+       int(args.num_train_epochs),
+       desc="Epoch",
+       disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
@@ -372,7 +379,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-           examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+           examples,
+           tokenizer,
+           label_list=label_list,
+           max_length=args.max_seq_length,
+           output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -434,15 +445,24 @@ def main():
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
-       "--patience", default="0", type=str, required=False,
+       "--patience",
+       default="0",
+       type=str,
+       required=False,
    )
    parser.add_argument(
-       "--regression_threshold", default=0, type=float, required=False,
+       "--regression_threshold",
+       default=0,
+       type=float,
+       required=False,
    )
    # Other parameters
    parser.add_argument(
-       "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+       "--config_name",
+       default="",
+       type=str,
+       help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
@@ -466,17 +486,27 @@ def main():
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
-       "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+       "--evaluate_during_training",
+       action="store_true",
+       help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
-       "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+       "--do_lower_case",
+       action="store_true",
+       help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
-       "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+       "--per_gpu_train_batch_size",
+       default=8,
+       type=int,
+       help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
-       "--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
+       "--per_gpu_eval_batch_size",
+       default=1,
+       type=int,
+       help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
@@ -485,13 +515,19 @@ def main():
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
-       "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
+       "--learning_rate",
+       default=5e-5,
+       type=float,
+       help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
-       "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+       "--num_train_epochs",
+       default=3.0,
+       type=float,
+       help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
@@ -503,7 +539,10 @@ def main():
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument(
-       "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
+       "--save_steps",
+       type=int,
+       default=500,
+       help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
@@ -512,10 +551,14 @@ def main():
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
-       "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+       "--overwrite_output_dir",
+       action="store_true",
+       help="Overwrite the content of the output directory",
    )
    parser.add_argument(
-       "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+       "--overwrite_cache",
+       action="store_true",
+       help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
@@ -532,7 +575,10 @@ def main():
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
-       "--local_rank", type=int, default=-1, help="For distributed training: local_rank",
+       "--local_rank",
+       type=int,
+       default=-1,
+       help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
@@ -634,7 +680,8 @@ def main():
        print("Output Layers Parameters:", output_layers_param_num)
        single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
        print(
-           "Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
+           "Added Output Layers Parameters:",
+           output_layers_param_num - single_output_layer_param_num,
        )
        logger.info("Training/evaluation parameters %s", args)
...
@@ -66,9 +66,9 @@ def print_2d_tensor(tensor):
def compute_heads_importance(
    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
):
-   """ This method shows how to compute:
+   """This method shows how to compute:
    - head attention entropy
    - head importance scores according to http://arxiv.org/abs/1905.10650
    """
    # Prepare our tensors
    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
@@ -150,8 +150,8 @@ def compute_heads_importance(
def mask_heads(args, model, eval_dataloader):
-   """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
+   """This method shows how to mask head (set some heads to zero), to test the effect on the network,
    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
@@ -201,8 +201,8 @@ def mask_heads(args, model, eval_dataloader):
def prune_heads(args, model, eval_dataloader, head_mask):
-   """ This method shows how to prune head (remove heads weights) based on
+   """This method shows how to prune head (remove heads weights) based on
    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
    """
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
@@ -395,7 +395,8 @@ def main():
        cache_dir=args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
-       args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
+       args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+       cache_dir=args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
...
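Several of the hunks above touch only docstring style: Black 20 drops the space after the opening triple quotes and re-indents the body, and a short docstring whose closing quotes sat on their own line is collapsed back onto a single line (the `entropy` hunk further down shows this). A minimal before/after sketch, illustration only; the function bodies here are elided placeholders, not code from the commit:

    # Before (parent commit style): space after the opening quotes, closing
    # quotes on their own line even though the docstring fits on one line.
    def entropy_before(x):
        """ Calculate entropy of a pre-softmax logit Tensor
        """
        ...  # body unchanged by the commit; elided here


    # After (this commit): no space after the opening quotes, and a one-line
    # docstring keeps its closing quotes on the same line.
    def entropy_after(x):
        """Calculate entropy of a pre-softmax logit Tensor"""
        ...  # body unchanged by the commit; elided here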
@@ -138,6 +138,9 @@ def get_image_transforms():
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
-           transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
+           transforms.Normalize(
+               mean=[0.46777044, 0.44531429, 0.40661017],
+               std=[0.12221994, 0.12145835, 0.14380469],
+           ),
        ]
    )
@@ -30,7 +30,11 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
            )
        else:
            topk_filled_outputs.append(
-               (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+               (
+                   masked_input.replace(masked_token, predicted_token),
+                   values[index].item(),
+                   predicted_token,
+               )
            )
    return topk_filled_outputs
...
@@ -71,10 +71,10 @@ def load_rocstories_dataset(dataset_path):
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
-   """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+   """Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
    To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
    input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
    """
    tensor_datasets = []
    for dataset in encoded_datasets:
@@ -83,7 +83,10 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
-       for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
+       for (
+           i,
+           (story, cont1, cont2, mc_label),
+       ) in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
            input_ids[i, 0, : len(with_cont1)] = with_cont1
...
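The oddly wrapped loop header in the hunk above is another magic-trailing-comma effect: the source has a stray comma after the unpacking target (`for i, (story, cont1, cont2, mc_label), in ...`), so Black 20 treats the target as a tuple to be expanded one element per line. Dropping that comma keeps the header on one line with identical semantics; a hypothetical minimal reproduction (the toy `dataset` is not from the commit):

    # Same unpacking as the loop in the hunk above, but without the stray
    # trailing comma in the target, so the header stays on a single line.
    dataset = [("story", "cont1", "cont2", 1)]
    for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
        print(i, story, cont1, cont2, mc_label)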
@@ -629,7 +629,9 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-   tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+   tokenizer = AutoTokenizer.from_pretrained(
+       args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+   )
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )
...
@@ -358,7 +358,11 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-           examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
+           examples,
+           tokenizer,
+           label_list=label_list,
+           max_length=args.max_seq_length,
+           output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
...
@@ -14,8 +14,7 @@ from transformers.modeling_bert import (
def entropy(x):
-   """ Calculate entropy of a pre-softmax logit Tensor
-   """
+   """Calculate entropy of a pre-softmax logit Tensor"""
    exp_x = torch.exp(x)
    A = torch.sum(exp_x, dim=1)  # sum of exp(x_i)
    B = torch.sum(x * exp_x, dim=1)  # sum of x_i * exp(x_i)
@@ -104,7 +103,8 @@ class DeeBertEncoder(nn.Module):
@add_start_docstrings(
-   "The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
+   "The Bert Model transformer with early exiting (DeeBERT). ",
+   BERT_START_DOCSTRING,
)
class DeeBertModel(BertPreTrainedModel):
    def __init__(self, config):
@@ -127,9 +127,9 @@ class DeeBertModel(BertPreTrainedModel):
        self.embeddings.word_embeddings = value
    def _prune_heads(self, heads_to_prune):
-       """ Prunes heads of the model.
+       """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
@@ -147,33 +147,33 @@ class DeeBertModel(BertPreTrainedModel):
        encoder_attention_mask=None,
    ):
        r"""
        Return:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
            last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
                Sequence of hidden-states at the output of the last layer of the model.
            pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
                Last layer hidden-state of the first token of the sequence (classification token)
                further processed by a Linear layer and a Tanh activation function. The Linear
                layer weights are trained from the next sentence prediction (classification)
                objective during pre-training.
                This output is usually *not* a good summary
                of the semantic content of the input, you're often better with averaging or pooling
                the sequence of hidden-states for the whole input sequence.
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
                Tuple of each early exit's results (total length: number of layers)
                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
        """
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
@@ -302,32 +302,32 @@ class DeeBertForSequenceClassification(BertPreTrainedModel):
        train_highway=False,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Returns:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
                Classification (or regression if config.num_labels==1) loss.
            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
                Tuple of each early exit's results (total length: number of layers)
                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
        """
        exit_layer = self.num_layers
...
@@ -11,7 +11,8 @@ from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayExc
@add_start_docstrings(
-   "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
+   "The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
+   ROBERTA_START_DOCSTRING,
)
class DeeRobertaModel(DeeBertModel):
@@ -58,32 +59,32 @@ class DeeRobertaForSequenceClassification(BertPreTrainedModel):
        train_highway=False,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Returns:
            :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
                Classification (or regression if config.num_labels==1) loss.
            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
                Classification (or regression if config.num_labels==1) scores (before SoftMax).
            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
                :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
                Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
                heads.
            highway_exits (:obj:`tuple(tuple(torch.Tensor))`:
                Tuple of each early exit's results (total length: number of layers)
                Each tuple is again, a tuple of length 2 - the first entry is logits and the second entry is hidden states.
        """
        exit_layer = self.num_layers
...
@@ -228,14 +228,20 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                assert end_logits_tea.size() == end_logits_stu.size()
                loss_fct = nn.KLDivLoss(reduction="batchmean")
-               loss_start = loss_fct(
-                   F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                   F.softmax(start_logits_tea / args.temperature, dim=-1),
-               ) * (args.temperature ** 2)
-               loss_end = loss_fct(
-                   F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                   F.softmax(end_logits_tea / args.temperature, dim=-1),
-               ) * (args.temperature ** 2)
+               loss_start = (
+                   loss_fct(
+                       F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                       F.softmax(start_logits_tea / args.temperature, dim=-1),
+                   )
+                   * (args.temperature ** 2)
+               )
+               loss_end = (
+                   loss_fct(
+                       F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                       F.softmax(end_logits_tea / args.temperature, dim=-1),
+                   )
+                   * (args.temperature ** 2)
+               )
                loss_ce = (loss_start + loss_end) / 2.0
                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
...
@@ -118,7 +118,8 @@ def init_gpu_params(params):
    if params.multi_gpu:
        logger.info("Initializing PyTorch distributed")
        torch.distributed.init_process_group(
-           init_method="env://", backend="nccl",
+           init_method="env://",
+           backend="nccl",
        )
...
@@ -233,7 +233,9 @@ def main():
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
-           tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length,
+           tokenizer=tokenizer,
+           plm_probability=data_args.plm_probability,
+           max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
...
@@ -226,10 +226,14 @@ class BaseTransformer(pl.LightningModule):
            help="Decoder layer dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
-           "--dropout", type=float, help="Dropout probability (Optional). Goes into model.config",
+           "--dropout",
+           type=float,
+           help="Dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
-           "--attention_dropout", type=float, help="Attention dropout probability (Optional). Goes into model.config",
+           "--attention_dropout",
+           type=float,
+           help="Attention dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument(
...
...@@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10): ...@@ -95,7 +95,10 @@ def make_support(question, source="wiki40b", method="dense", n_results=10):
) )
else: else:
support_doc, hit_lst = query_es_index( support_doc, hit_lst = query_es_index(
question, es_client, index_name="english_wiki40b_snippets_100w", n_results=n_results, question,
es_client,
index_name="english_wiki40b_snippets_100w",
n_results=n_results,
) )
support_list = [ support_list = [
(res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst (res["article_title"], res["section_title"].strip(), res["score"], res["passage_text"]) for res in hit_lst
...@@ -154,7 +157,8 @@ header_full = """ ...@@ -154,7 +157,8 @@ header_full = """
header_html, header_html,
) )
st.sidebar.markdown( st.sidebar.markdown(
header_full, unsafe_allow_html=True, header_full,
unsafe_allow_html=True,
) )
# Long Form QA with ELI5 and Wikipedia # Long Form QA with ELI5 and Wikipedia
...@@ -173,9 +177,17 @@ action_list = [ ...@@ -173,9 +177,17 @@ action_list = [
] ]
demo_options = st.sidebar.checkbox("Demo options") demo_options = st.sidebar.checkbox("Demo options")
if demo_options: if demo_options:
action_st = st.sidebar.selectbox("", action_list, index=3,) action_st = st.sidebar.selectbox(
"",
action_list,
index=3,
)
action = action_list.index(action_st) action = action_list.index(action_st)
show_type = st.sidebar.selectbox("", ["Show full text of passages", "Show passage section titles"], index=0,) show_type = st.sidebar.selectbox(
"",
["Show full text of passages", "Show passage section titles"],
index=0,
)
show_passages = show_type == "Show full text of passages" show_passages = show_type == "Show full text of passages"
else: else:
action = 3 action = 3
...@@ -250,7 +262,9 @@ questions_list = [ ...@@ -250,7 +262,9 @@ questions_list = [
"How does New Zealand have so many large bird predators?", "How does New Zealand have so many large bird predators?",
] ]
question_s = st.selectbox( question_s = st.selectbox(
"What would you like to ask? ---- select <MY QUESTION> to enter a new query", questions_list, index=1, "What would you like to ask? ---- select <MY QUESTION> to enter a new query",
questions_list,
index=1,
) )
if question_s == "<MY QUESTION>": if question_s == "<MY QUESTION>":
question = st.text_input("Enter your question here:", "") question = st.text_input("Enter your question here:", "")
......
...@@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki ...@@ -48,7 +48,11 @@ def make_es_index_snippets(es_client, passages_dset, index_name="english_wiki_ki
yield passage yield passage
# create the ES index # create the ES index
for ok, action in streaming_bulk(client=es_client, index=index_name, actions=passage_generator(),): for ok, action in streaming_bulk(
client=es_client,
index=index_name,
actions=passage_generator(),
):
progress.update(1) progress.update(1)
successes += ok successes += ok
print("Indexed %d documents" % (successes,)) print("Indexed %d documents" % (successes,))
...@@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module): ...@@ -137,7 +141,11 @@ class RetrievalQAEmbedder(torch.nn.Module):
# define function for checkpointing # define function for checkpointing
def partial_encode(*inputs): def partial_encode(*inputs):
encoder_outputs = self.sent_encoder.encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask,) encoder_outputs = self.sent_encoder.encoder(
inputs[0],
attention_mask=inputs[1],
head_mask=head_mask,
)
sequence_output = encoder_outputs[0] sequence_output = encoder_outputs[0]
pooled_output = self.sent_encoder.pooler(sequence_output) pooled_output = self.sent_encoder.pooler(sequence_output)
return pooled_output return pooled_output
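`partial_encode` in the hunk above exists so the encoder forward pass can be replayed under activation checkpointing. A minimal sketch of that memory-saving pattern, with hypothetical module and tensor names:

    from torch.utils.checkpoint import checkpoint

    def encode_with_checkpointing(encoder, hidden_states, attention_mask, head_mask=None):
        def partial_encode(*inputs):
            # only the positional inputs are tracked for recomputation
            return encoder(inputs[0], attention_mask=inputs[1], head_mask=head_mask)[0]

        # activations inside partial_encode are dropped on the forward pass and
        # recomputed during backward, trading extra compute for lower memory
        return checkpoint(partial_encode, hidden_states, attention_mask)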
...@@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar ...@@ -234,7 +242,11 @@ def train_qa_retriever_epoch(model, dataset, tokenizer, optimizer, scheduler, ar
if step % args.print_freq == 0 or step == 1: if step % args.print_freq == 0 or step == 1:
print( print(
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format( "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time, e,
step,
len(dataset) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
) )
) )
loc_loss = 0 loc_loss = 0
...@@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc ...@@ -273,7 +285,11 @@ def train_qa_retriever_joint_epoch(model, dataset_list, tokenizer, optimizer, sc
if step % args.print_freq == 0: if step % args.print_freq == 0:
print( print(
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format( "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
e, step, len(dataset_list[0]) // args.batch_size, loc_loss / loc_steps, time() - st_time, e,
step,
len(dataset_list[0]) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
) )
) )
loc_loss = 0 loc_loss = 0
...@@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset): ...@@ -354,7 +370,8 @@ class ELI5DatasetS2S(Dataset):
self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"])) self.document_cache[q_id] = self.document_cache.get(q_id, self.make_doc_function(example["title"]))
document = self.document_cache[q_id] document = self.document_cache[q_id]
in_st = "question: {} context: {}".format( in_st = "question: {} context: {}".format(
question.lower().replace(" --t--", "").strip(), document.lower().strip(), question.lower().replace(" --t--", "").strip(),
document.lower().strip(),
) )
out_st = answer out_st = answer
return (in_st, out_st) return (in_st, out_st)
...@@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e= ...@@ -427,7 +444,11 @@ def train_qa_s2s_epoch(model, dataset, tokenizer, optimizer, scheduler, args, e=
if step % args.print_freq == 0 or step == 1: if step % args.print_freq == 0 or step == 1:
print( print(
"{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format( "{:2d} {:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
e, step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time, e,
step,
len(dataset) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
) )
) )
loc_loss = 0 loc_loss = 0
...@@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args): ...@@ -456,10 +477,18 @@ def eval_qa_s2s_epoch(model, dataset, tokenizer, args):
if step % args.print_freq == 0: if step % args.print_freq == 0:
print( print(
"{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format( "{:5d} of {:5d} \t L: {:.3f} \t -- {:.3f}".format(
step, len(dataset) // args.batch_size, loc_loss / loc_steps, time() - st_time, step,
len(dataset) // args.batch_size,
loc_loss / loc_steps,
time() - st_time,
) )
) )
print("Total \t L: {:.3f} \t -- {:.3f}".format(loc_loss / loc_steps, time() - st_time,)) print(
"Total \t L: {:.3f} \t -- {:.3f}".format(
loc_loss / loc_steps,
time() - st_time,
)
)
def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args): def train_qa_s2s(qa_s2s_model, qa_s2s_tokenizer, s2s_train_dset, s2s_valid_dset, s2s_args):
...@@ -506,7 +535,12 @@ def qa_s2s_generate( ...@@ -506,7 +535,12 @@ def qa_s2s_generate(
max_input_length=512, max_input_length=512,
device="cuda:0", device="cuda:0",
): ):
model_inputs = make_qa_s2s_batch([(question_doc, "A")], qa_s2s_tokenizer, max_input_length, device=device,) model_inputs = make_qa_s2s_batch(
[(question_doc, "A")],
qa_s2s_tokenizer,
max_input_length,
device=device,
)
n_beams = num_answers if num_beams is None else max(num_beams, num_answers) n_beams = num_answers if num_beams is None else max(num_beams, num_answers)
generated_ids = qa_s2s_model.generate( generated_ids = qa_s2s_model.generate(
input_ids=model_inputs["input_ids"], input_ids=model_inputs["input_ids"],
......
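For context on `qa_s2s_generate`: the reformatted call feeds a single `question: ... context: ...` string into a seq2seq model and beam-searches with at least as many beams as requested answers. A hedged sketch with a generic checkpoint (the script uses its own fine-tuned BART model):

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")  # assumption: any seq2seq checkpoint
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")

    question_doc = "question: why is the sky blue? context: ..."  # hypothetical input
    num_answers, num_beams = 2, None
    n_beams = num_answers if num_beams is None else max(num_beams, num_answers)

    model_inputs = tokenizer(question_doc, return_tensors="pt", truncation=True, max_length=512)
    generated_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        num_beams=n_beams,
        num_return_sequences=num_answers,
        max_length=64,
    )
    answers = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)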
...@@ -37,8 +37,7 @@ logger = logging.getLogger(__name__) ...@@ -37,8 +37,7 @@ logger = logging.getLogger(__name__)
class BertEmbeddings(nn.Module): class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings. """Construct the embeddings from word, position and token_type embeddings."""
"""
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
...@@ -385,8 +384,8 @@ class BertPooler(nn.Module): ...@@ -385,8 +384,8 @@ class BertPooler(nn.Module):
class MaskedBertPreTrainedModel(PreTrainedModel): class MaskedBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and """An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models. a simple interface for downloading and loading pretrained models.
""" """
config_class = MaskedBertConfig config_class = MaskedBertConfig
...@@ -492,9 +491,9 @@ class MaskedBertModel(MaskedBertPreTrainedModel): ...@@ -492,9 +491,9 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model. """Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel See base class PreTrainedModel
""" """
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
...@@ -685,31 +684,31 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel): ...@@ -685,31 +684,31 @@ class MaskedBertForSequenceClassification(MaskedBertPreTrainedModel):
threshold=None, threshold=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
threshold (:obj:`float`): threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`). Threshold value (see :class:`~emmental.MaskedLinear`).
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
Classification (or regression if config.num_labels==1) loss. Classification (or regression if config.num_labels==1) loss.
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax). Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
""" """
outputs = self.bert( outputs = self.bert(
...@@ -770,32 +769,32 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel): ...@@ -770,32 +769,32 @@ class MaskedBertForMultipleChoice(MaskedBertPreTrainedModel):
threshold=None, threshold=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
threshold (:obj:`float`): threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`). Threshold value (see :class:`~emmental.MaskedLinear`).
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss. Classification loss.
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
`num_choices` is the second dimension of the input tensors. (see `input_ids` above). `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
""" """
num_choices = input_ids.shape[1] num_choices = input_ids.shape[1]
...@@ -860,29 +859,29 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel): ...@@ -860,29 +859,29 @@ class MaskedBertForTokenClassification(MaskedBertPreTrainedModel):
threshold=None, threshold=None,
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the token classification loss. Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``. Indices should be in ``[0, ..., config.num_labels - 1]``.
threshold (:obj:`float`): threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`). Threshold value (see :class:`~emmental.MaskedLinear`).
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
Classification loss. Classification loss.
scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
Classification scores (before SoftMax). Classification scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
""" """
outputs = self.bert( outputs = self.bert(
...@@ -947,36 +946,36 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel): ...@@ -947,36 +946,36 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
threshold=None, threshold=None,
): ):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss. Position outside of the sequence are not taken into account for computing the loss.
threshold (:obj:`float`): threshold (:obj:`float`):
Threshold value (see :class:`~emmental.MaskedLinear`). Threshold value (see :class:`~emmental.MaskedLinear`).
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~emmental.MaskedBertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-start scores (before SoftMax). Span-start scores (before SoftMax).
end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
Span-end scores (before SoftMax). Span-end scores (before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
""" """
outputs = self.bert( outputs = self.bert(
...@@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel): ...@@ -996,7 +995,10 @@ class MaskedBertForQuestionAnswering(MaskedBertPreTrainedModel):
start_logits = start_logits.squeeze(-1) start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1) end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,) + outputs[2:] outputs = (
start_logits,
end_logits,
) + outputs[2:]
if start_positions is not None and end_positions is not None: if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension # If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1: if len(start_positions.size()) > 1:
......
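The final hunk only re-wraps the output tuple; the span-extraction head behind it is a linear layer producing one start and one end logit per token. A sketch with hypothetical shapes:

    import torch
    from torch import nn

    batch_size, seq_len, hidden_size = 2, 16, 768
    sequence_output = torch.randn(batch_size, seq_len, hidden_size)  # hypothetical encoder output

    qa_outputs = nn.Linear(hidden_size, 2)        # one start score and one end score per token
    logits = qa_outputs(sequence_output)          # (batch_size, seq_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)       # (batch_size, seq_len)
    end_logits = end_logits.squeeze(-1)

    # with labels, the loss is the average of two cross-entropies over start and end positions
    start_positions = torch.randint(0, seq_len, (batch_size,))
    end_positions = torch.randint(0, seq_len, (batch_size,))
    loss_fct = nn.CrossEntropyLoss()
    total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2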