Commit 8cba0572 authored by LysandreJik

Doc + remove artefacts

parent 6393261e
@@ -724,9 +724,8 @@ class PreTrainedTokenizer(object):
    def encode_plus(self, text, text_pair=None, add_special_tokens=False, output_mask=False, max_length=None, **kwargs):
        """
        Converts a string into a sequence of ids (integers), using the tokenizer and vocabulary.
        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

        Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
        method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.

        Args:
            text: The first sequence to be encoded.
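For reference, a hedged usage sketch of the signature shown above; the package import, checkpoint name, and the exact keys of the returned dictionary are assumptions about this version of the library, not taken from this diff:

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Single sequence, with the model's special tokens added:
    single = tokenizer.encode_plus("Hello, world!", add_special_tokens=True)

    # Sequence pair, truncated to max_length, requesting the sequence-pair mask:
    pair = tokenizer.encode_plus(
        "First sentence.",
        text_pair="Second sentence.",
        add_special_tokens=True,
        output_mask=True,
        max_length=16,
    )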
@@ -801,42 +800,6 @@ class PreTrainedTokenizer(object):
        return information

        if text_pair is None:
            if add_special_tokens:
                sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
                if max_length:
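                    # Illustrative arithmetic (not from this diff): with max_length=8 and a
                    # tokenizer that adds 2 special tokens, keep 8 - 2 = 6 ids here.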
                    sequence_tokens = sequence_tokens[:max_length - self.num_added_tokens()]
                return self.add_special_tokens_single_sentence(sequence_tokens)
            else:
                ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
                return ids[:max_length] if max_length != -1 else ids
        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]

        if add_special_tokens:
            if max_length:
                if len(first_sentence_tokens) + self.num_added_tokens(pair=True) >= max_length:
                    logger.warning(
                        "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
                else:
                    if len(second_sentence_tokens) + len(first_sentence_tokens) + self.num_added_tokens(pair=True) > max_length:
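                        # Illustrative arithmetic (not from this diff): max_length=16, 10 ids in
                        # the first sequence and 3 pair tokens leave 16 - 10 - 3 = 3 ids here.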
                        second_sentence_tokens = second_sentence_tokens[:max_length - len(first_sentence_tokens) - self.num_added_tokens(pair=True)]
            return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens, output_mask)
        else:
            if max_length:
                first_sentence_tokens = first_sentence_tokens[:max_length]
                second_sentence_tokens = second_sentence_tokens[:max_length]
            if output_mask:
                logger.warning("Can't output mask if you're not joining two sequences.")
            return first_sentence_tokens, second_sentence_tokens
    def add_special_tokens_single_sentence(self, token_ids):
        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
        return token_ids
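For contrast, a hedged sketch of how a concrete subclass might override this hook to actually add special tokens; the class name and token ids below are illustrative placeholders, not taken from this diff:

    class BertLikeTokenizer(PreTrainedTokenizer):
        # Placeholder ids standing in for [CLS] and [SEP]; a real tokenizer
        # would look these up from its vocabulary.
        cls_token_id = 101
        sep_token_id = 102

        def add_special_tokens_single_sentence(self, token_ids):
            # Wrap the sequence as [CLS] tokens [SEP], BERT-style.
            return [self.cls_token_id] + token_ids + [self.sep_token_id]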