Merge branch 'master' into saving-and-resuming

518ba748 · Thomas Wolf · GitHub · eeb70cdd · 18601c3b · 518ba748
Unverified Commit 518ba748 authored Dec 21, 2019 by Thomas Wolf Committed by GitHub Dec 21, 2019
7 changed files
--- a/README.md
+++ b/README.md
@@ -133,7 +133,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training

 ## Model architectures

-🤗 Transformers currently provides 10 NLU/NLG architectures:
+🤗 Transformers currently provides the following NLU/NLG architectures:

 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -40,6 +40,7 @@ from tqdm import tqdm, trange

 from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForQuestionAnswering, BertTokenizer,
+                                  RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig,
                                  XLMConfig, XLMForQuestionAnswering,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForQuestionAnswering,
@@ -53,11 +54,12 @@ from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_e

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys())
-                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ())

 MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
@@ -191,13 +193,11 @@ def train(args, train_dataset, model, tokenizer):
            inputs = {
                'input_ids':       batch[0],
                'attention_mask':  batch[1],
+                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
                'start_positions': batch[3],
-                'end_positions':   batch[4]
+                'end_positions':   batch[4],
            }

-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
-
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5],
                               'p_mask':       batch[6]})
@@ -315,13 +315,9 @@ def evaluate(args, model, tokenizer, prefix=""):
        with torch.no_grad():
            inputs = {
                'input_ids':      batch[0],
-                'attention_mask': batch[1]
+                'attention_mask': batch[1],
+                'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2],
            }
-
-            if args.model_type != 'distilbert':
-                # XLM don't use segment_ids
-                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
-
            example_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
@@ -449,7 +445,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
-            return_dataset='pt'
+            return_dataset='pt',
+            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
@@ -568,10 +565,10 @@ def main():
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='',
-                        help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='',
-                        help="Can be used for distant debugging.")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+
+    parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features')
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:

--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -106,7 +106,7 @@ if is_torch_available():
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
                                RobertaForSequenceClassification, RobertaForMultipleChoice,
-                                RobertaForTokenClassification,
+                                RobertaForTokenClassification, RobertaForQuestionAnswering,
                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel,
                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,

--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -377,7 +377,8 @@ def compute_predictions_logits(
    output_null_log_odds_file,
    verbose_logging,
    version_2_with_negative,
-    null_score_diff_threshold
+    null_score_diff_threshold,
+    tokenizer,
 ):
    """Write final predictions to the json file and log-odds of null if needed."""
    logger.info("Writing predictions to: %s" % (output_prediction_file))
@@ -474,11 +475,14 @@ def compute_predictions_logits(
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
-                tok_text = " ".join(tok_tokens)

-                # De-tokenize WordPieces that have been split off.
-                tok_text = tok_text.replace(" ##", "")
-                tok_text = tok_text.replace("##", "")
+                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+                # tok_text = " ".join(tok_tokens)
+                #
+                # # De-tokenize WordPieces that have been split off.
+                # tok_text = tok_text.replace(" ##", "")
+                # tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()

--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -4,6 +4,9 @@ import logging
 import os
 import json
 import numpy as np
+from multiprocessing import Pool
+from multiprocessing import cpu_count
+from functools import partial

 from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
@@ -79,59 +82,20 @@ def _is_whitespace(c):
        return True
    return False

-
-def squad_convert_examples_to_features(
-    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
-):
-    """
-    Converts a list of examples into a list of features that can be directly given as input to a model.
-    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
-
-    Args:
-        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
-        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
-        max_seq_length: The maximum sequence length of the inputs.
-        doc_stride: The stride used when the context is too large and is split across several features.
-        max_query_length: The maximum length of the query.
-        is_training: whether to create features for model evaluation or model training.
-        return_dataset: Default False. Either 'pt' or 'tf'.
-            if 'pt': returns a torch.data.TensorDataset,
-            if 'tf': returns a tf.data.Dataset
-
-    Returns:
-        list of :class:`~transformers.data.processors.squad.SquadFeatures`
-
-    Example::
-
-        processor = SquadV2Processor()
-        examples = processor.get_dev_examples(data_dir)
-
-        features = squad_convert_examples_to_features( 
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-        )
-    """
-
-    # Defining helper methods
-    unique_id = 1000000000
-
+def squad_convert_example_to_features(example, max_seq_length,
+                                       doc_stride, max_query_length, is_training):
    features = []
-    for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
    if is_training and not example.is_impossible:
        # Get start and end position
        start_position = example.start_position
        end_position = example.end_position

        # If the answer cannot be found in the text, then skip this example.
-            actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
+        actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
-                continue
+            return []

    tok_to_orig_index = []
    orig_to_tok_index = []
@@ -156,10 +120,9 @@ def squad_convert_examples_to_features(

    spans = []

-        truncated_query = tokenizer.encode(
-            example.question_text, add_special_tokens=False, max_length=max_query_length
-        )
-        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
+    truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
+    sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \
+        if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence
    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
@@ -172,18 +135,16 @@ def squad_convert_examples_to_features(
            return_overflowing_tokens=True,
            pad_to_max_length=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
+            truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
        )

-            paragraph_len = min(
-                len(all_doc_tokens) - len(spans) * doc_stride,
-                max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
-            )
+        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
+                            max_seq_length - len(truncated_query) - sequence_pair_added_tokens)

-            if tokenizer.pad_token_id in encoded_dict["input_ids"]:
-                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
+        if tokenizer.pad_token_id in encoded_dict['input_ids']:
+            non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
        else:
-                non_padded_ids = encoded_dict["input_ids"]
+            non_padded_ids = encoded_dict['input_ids']

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

@@ -209,20 +170,17 @@ def squad_convert_examples_to_features(
    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-                index = (
-                    j
-                    if tokenizer.padding_side == "left"
-                    else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
-                )
+            index = j if tokenizer.padding_side == "left" else spans[doc_span_index][
+                                                                   "truncated_query_with_special_tokens_length"] + j
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
-            cls_index = span["input_ids"].index(tokenizer.cls_token_id)
+        cls_index = span['input_ids'].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
        # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = np.array(span["token_type_ids"])
+        p_mask = np.array(span['token_type_ids'])

        p_mask = np.minimum(p_mask, 1)

@@ -261,27 +219,88 @@ def squad_convert_examples_to_features(
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

-            features.append(
-                SquadFeatures(
-                    span["input_ids"],
-                    span["attention_mask"],
-                    span["token_type_ids"],
+        features.append(SquadFeatures(
+            span['input_ids'],
+            span['attention_mask'],
+            span['token_type_ids'],
            cls_index,
            p_mask.tolist(),
-                    example_index=example_index,
-                    unique_id=unique_id,
-                    paragraph_len=span["paragraph_len"],
+            example_index=0,  # Placeholder: example_index and unique_id cannot be set here; they are assigned after the multiprocessing step.
+            unique_id=0,
+            paragraph_len=span['paragraph_len'],
            token_is_max_context=span["token_is_max_context"],
            tokens=span["tokens"],
            token_to_orig_map=span["token_to_orig_map"],
+
            start_position=start_position,
-                    end_position=end_position,
-                )
+            end_position=end_position
+        ))
+    return features
+
+def squad_convert_example_to_features_init(tokenizer_for_convert):
+    global tokenizer
+    tokenizer = tokenizer_for_convert
+
+def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                       doc_stride, max_query_length, is_training, 
+                                       return_dataset=False, threads=1):
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model.
+    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
+        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model evaluation or model training.
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns a torch.data.TensorDataset,
+            if 'tf': returns a tf.data.Dataset
+        threads: Default 1. Number of worker processes used to convert the examples (capped at the CPU count).
+
+
+    Returns:
+        list of :class:`~transformers.data.processors.squad.SquadFeatures`
+
+    Example::
+
+        processor = SquadV2Processor()
+        examples = processor.get_dev_examples(data_dir)
+
+        features = squad_convert_examples_to_features( 
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
        )
+    """

+    # Convert the examples to features in a pool of worker processes
+    features = []
+    threads = min(threads, cpu_count())
+    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+        annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length,
+                                       doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training)
+        features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features'))
+    new_features = []
+    unique_id = 1000000000
+    example_index = 0
+    for example_features in tqdm(features, total=len(features), desc='add example index and unique id'):
+        if not example_features:
+            continue
+        for example_feature in example_features:
+            example_feature.example_index = example_index
+            example_feature.unique_id = unique_id
+            new_features.append(example_feature)
            unique_id += 1
-
-    if return_dataset == "pt":
+        example_index += 1
+    features = new_features
+    del new_features
+    if return_dataset == 'pt':
        if not is_torch_available():
            raise ImportError("Pytorch must be installed to return a pytorch dataset.")


--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -575,3 +575,89 @@ class RobertaClassificationHead(nn.Module):
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
+
+
+@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForQuestionAnswering(BertPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
+        model = RobertaForQuestionAnswering.from_pretrained('roberta-large')
+        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        input_ids = tokenizer.encode(question, text)
+        start_scores, end_scores = model(torch.tensor([input_ids]))
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForQuestionAnswering, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                start_positions=None, end_positions=None):
+
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, splitting the batch adds a dimension; squeeze it away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
\ No newline at end of file
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -231,6 +231,7 @@ class PreTrainedTokenizer(object):
        
        # Added tokens
        self.added_tokens_encoder = {}
+        self.unique_added_tokens_encoder = set()
        self.added_tokens_decoder = {}

        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
@@ -554,6 +555,7 @@ class PreTrainedTokenizer(object):
        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
        added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
+        self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens))
        self.added_tokens_decoder.update(added_tok_decoder)

        return len(to_add_tokens)
@@ -631,6 +633,7 @@ class PreTrainedTokenizer(object):

        return added_tokens

+
    def tokenize(self, text, **kwargs):
        """ Converts a string in a sequence of tokens (string), using the tokenizer.
            Split in words for word-based vocabulary or sub-words for sub-word-based
@@ -685,18 +688,17 @@ class PreTrainedTokenizer(object):
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
-                    if sub_text not in self.added_tokens_encoder \
-                            and sub_text not in all_special_tokens:
+                    if sub_text not in self.unique_added_tokens_encoder:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

-            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
-                    in self.added_tokens_encoder and token not in all_special_tokens \
+            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) \
+                    if token not in self.unique_added_tokens_encoder
                    else [token] for token in tokenized_text)))

-        added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens
+        added_tokens = self.unique_added_tokens_encoder
        tokenized_text = split_on_tokens(added_tokens, text)
        return tokenized_text