"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "6247d1b2b6a3aea7c1484d9c6560a5795d35b7e0"
Commit e384ae2b authored by erenup

Merge remote-tracking branch 'huggingface/master'

merge huggingface/master to update
parents b219029c d8923270
@@ -180,9 +180,10 @@ class PreTrainedTokenizer(object):
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+        r"""
+        Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
 
-        Parameters:
+        Args:
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
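For reference, a minimal sketch of how this entry point is used, assuming the ``bert-base-uncased`` shortcut named in the docstring above::

    from pytorch_transformers import BertTokenizer

    # Downloads the predefined vocabulary on first use, then reuses the local cache.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')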
@@ -383,14 +384,15 @@ class PreTrainedTokenizer(object):
     def add_tokens(self, new_tokens):
-        """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
+        """
+        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
         vocabulary, they are added to it with indices starting from length of the current vocabulary.
 
-        Parameters:
+        Args:
             new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
 
         Returns:
             Number of tokens added to the vocabulary.
 
         Examples::
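The ``Examples::`` block truncated above boils down to a pattern like the following sketch; the token strings are illustrative, and the matching model-side call appears only as a comment::

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
    print('We have added', num_added_toks, 'tokens')
    # The model's embedding matrix then has to grow to match the new vocabulary size,
    # e.g. model.resize_token_embeddings(len(tokenizer)).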
@@ -422,17 +424,20 @@ class PreTrainedTokenizer(object):
     def add_special_tokens(self, special_tokens_dict):
-        """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
+        """
+        Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
         to class attributes. If special tokens are NOT in the vocabulary, they are added
         to it (indexed starting from the last index of the current vocabulary).
 
-        Parameters:
-            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
+        Args:
+            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
+                [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
+                ``additional_special_tokens``].
 
                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
 
         Returns:
             Number of tokens added to the vocabulary.
 
         Examples::
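Likewise, a minimal sketch of ``add_special_tokens``, assuming the GPT-2 tokenizer (which ships without a ``cls_token``); the ``<CLS>`` string is an arbitrary example value::

    from pytorch_transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    num_added_toks = tokenizer.add_special_tokens({'cls_token': '<CLS>'})
    # The new token is registered both in the vocabulary and as a class attribute.
    assert tokenizer.cls_token == '<CLS>'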
@@ -519,14 +524,37 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self, text):
-        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-            Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+    def encode(self, text, text_pair=None, add_special_tokens=False):
+        """
+        Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
+        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+
+        Args:
+            text: The first sequence to be encoded.
+            text_pair: Optional second sequence to be encoded.
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
         """
-        return self.convert_tokens_to_ids(self.tokenize(text))
+        if text_pair is None:
+            if add_special_tokens:
+                return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
+            else:
+                return self.convert_tokens_to_ids(self.tokenize(text))
+
+        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
+        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]
+
+        if add_special_tokens:
+            return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
+        else:
+            return first_sentence_tokens, second_sentence_tokens
+
+    def add_special_tokens_single_sentence(self, token_ids):
+        raise NotImplementedError
+
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        raise NotImplementedError
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token
@@ -561,16 +589,28 @@ class PreTrainedTokenizer(object):
         return ' '.join(self.convert_ids_to_tokens(tokens))
 
     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
-            with options to remove special tokens and clean up tokenization spaces.
+        """
+        Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
+        with options to remove special tokens and clean up tokenization spaces.
         Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
         """
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
-        if clean_up_tokenization_spaces:
-            text = self.clean_up_tokenization(text)
-        return text
+
+        if self.sep_token is not None and self.sep_token in text:
+            text = text.replace(self.cls_token, self.sep_token)
+            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
+            if clean_up_tokenization_spaces:
+                clean_text = [self.clean_up_tokenization(text) for text in split_text]
+                return clean_text
+            else:
+                return split_text
+        else:
+            if clean_up_tokenization_spaces:
+                clean_text = self.clean_up_tokenization(text)
+                return clean_text
+            else:
+                return text
 
     @property
     def special_tokens_map(self):
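Continuing the ``encode`` sketch above, ``decode`` now returns one string per segment when the sep token is still present in the decoded text, and a single string otherwise::

    # Special tokens were kept (skip_special_tokens=False), so the sep token is found and
    # decode() returns a list with one cleaned-up string per segment.
    segments = tokenizer.decode(pair_ids, skip_special_tokens=False)

    # No sep token in these ids, so decode() returns a single string as before.
    text = tokenizer.decode(ids)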
@@ -602,7 +642,7 @@ class PreTrainedTokenizer(object):
         class attributes (cls_token, unk_token...).
         """
         all_toks = self.all_special_tokens
-        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
+        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
         return all_ids
 
     @staticmethod
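For completeness, a small sketch of what ``all_special_ids`` exposes; ``xlnet-base-cased`` is just an illustrative shortcut::

    from pytorch_transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    print(tokenizer.all_special_tokens)   # the special tokens registered as class attributes
    print(tokenizer.all_special_ids)      # their ids, now resolved through _convert_token_to_id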
@@ -214,6 +214,22 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
+    def add_special_tokens_single_sentence(self, token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        An XLM sequence has the following format: [CLS] X [SEP]
+        """
+        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
+
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        """
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(save_directory):
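A quick check of the XLM layout produced by the methods above (a sketch, assuming the ``xlm-mlm-en-2048`` shortcut)::

    from pytorch_transformers import XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    ids = tokenizer.encode("Hello world", text_pair="How are you?", add_special_tokens=True)

    # [CLS] A [SEP] B [SEP]: the cls id opens the sequence and a sep id closes it.
    assert ids[0] == tokenizer._convert_token_to_id(tokenizer.cls_token)
    assert ids[-1] == tokenizer._convert_token_to_id(tokenizer.sep_token)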
@@ -177,6 +177,24 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
+    def add_special_tokens_single_sentence(self, token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        An XLNet sequence has the following format: X [SEP][CLS]
+        """
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return token_ids + sep + cls
+
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
+        """
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return token_ids_0 + sep + token_ids_1 + sep + cls
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
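XLNet, by contrast, appends its special tokens at the end of the sequence; a matching sketch (``xlnet-base-cased`` assumed)::

    from pytorch_transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    ids = tokenizer.encode("Hello world", text_pair="How are you?", add_special_tokens=True)

    # A [SEP] B [SEP][CLS]: the last id is the cls token, preceded by a sep token.
    assert ids[-1] == tokenizer._convert_token_to_id(tokenizer.cls_token)
    assert ids[-2] == tokenizer._convert_token_to_id(tokenizer.sep_token)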
@@ -38,10 +38,10 @@ from setuptools import find_packages, setup
 setup(
     name="pytorch_transformers",
-    version="1.0.0",
-    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    version="1.1.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
-    description="Repository of pre-trained NLP Transformer models: BERT, GPT & GPT-2, Transformer-XL, XLNet and XLM",
+    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',