Commit 8cba0572 authored by LysandreJik

Doc + remove artefacts

parent 6393261e
@@ -724,9 +724,8 @@ class PreTrainedTokenizer(object):

     def encode_plus(self, text, text_pair=None, add_special_tokens=False, output_mask=False, max_length=None, **kwargs):
         """
-        Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-
-        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+        Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
+        method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.

         Args:
             text: The first sequence to be encoded.
@@ -801,42 +800,6 @@ class PreTrainedTokenizer(object):

             return information

-        if text_pair is None:
-            if add_special_tokens:
-                sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-                if max_length:
-                    sequence_tokens = sequence_tokens[:max_length - self.num_added_tokens()]
-                return self.add_special_tokens_single_sentence(sequence_tokens)
-            else:
-                ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-                return ids[:max_length] if max_length != -1 else ids
-
-        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
-
-        if add_special_tokens:
-            if max_length:
-                if len(first_sentence_tokens) + self.num_added_tokens(pair=True) >= max_length:
-                    logger.warning(
-                        "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
-                else:
-                    if len(second_sentence_tokens) + len(first_sentence_tokens) + self.num_added_tokens(pair=True) > max_length:
-                        second_sentence_tokens = second_sentence_tokens[:max_length - len(first_sentence_tokens) - self.num_added_tokens(pair=True)]
-
-            return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens, output_mask)
-        else:
-            if max_length:
-                first_sentence_tokens = first_sentence_tokens[:max_length]
-                second_sentence_tokens = second_sentence_tokens[:max_length]
-            if output_mask:
-                logger.warning("Can't output mask if you're not joining two sequences.")
-            return first_sentence_tokens, second_sentence_tokens
     def add_special_tokens_single_sentence(self, token_ids):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
         return token_ids
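
For context, a minimal usage sketch of the ``encode_plus`` contract described by the new docstring. This sketch is not part of the commit: the ``pytorch_transformers`` package name and the ``BertTokenizer`` subclass are assumptions for illustration, and since the diff does not name the dictionary keys, the sketch only iterates over whatever the method returns.

    # Hedged usage sketch (not part of the commit). Package and class names
    # are assumptions; the encode_plus signature matches the diff above.
    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # encode_plus returns a dictionary rather than a bare list of ids.
    encoded = tokenizer.encode_plus(
        "Hello, world!",
        text_pair="How are you?",
        add_special_tokens=True,   # add model-specific special tokens ([CLS]/[SEP] for BERT)
        output_mask=True,          # also request the sequence-classification mask
        max_length=16,             # overflowing elements are reported when this is set
    )

    # The docstring does not fix the key names, so inspect whatever comes back:
    # the encoded sequence, the classification mask, and any overflow.
    for key, value in encoded.items():
        print(key, value)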