"...wan/git@developer.sourcefind.cn:Wenxuan/LightX2V.git" did not exist on "47b3ce2f0e417b9ecbdca450e8752e9fcf39963c"
Unverified Commit 90dcd8c0 authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into generative-finetuning

parents d6bbcbc4 e00b4ff1
@@ -647,12 +647,16 @@ XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
     Parameters:
         config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLNET_INPUTS_DOCSTRING = r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
+            XLNet is a model with relative position embeddings so you can either pad the inputs on
+            the right or on the left.
             Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
...
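The first added docstring line draws a distinction that is easy to trip over: constructing a model directly from a config gives random weights. A minimal sketch, assuming the 'xlnet-base-cased' shortcut name and the XLNet classes referenced in the hunk:

    from pytorch_transformers import XLNetConfig, XLNetModel

    # Config-only initialization: the architecture comes from the config,
    # but the weights are freshly (randomly) initialized.
    config = XLNetConfig.from_pretrained('xlnet-base-cased')
    model = XLNetModel(config)

    # To load the pretrained weights as well, go through from_pretrained.
    model = XLNetModel.from_pretrained('xlnet-base-cased')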
@@ -21,6 +21,7 @@ import os
 import shutil
 import json
 import random
+import uuid
 
 import unittest
 import logging
@@ -527,7 +528,7 @@ class ConfigTester(object):
     def create_and_test_config_to_json_file(self):
         config_first = self.config_class(**self.inputs_dict)
-        json_file_path = "/tmp/config.json"
+        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
         config_first.to_json_file(json_file_path)
         config_second = self.config_class.from_json_file(json_file_path)
         os.remove(json_file_path)
...
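A hard-coded "/tmp/config.json" collides when tests run in parallel and assumes a POSIX /tmp; the uuid-based name in the working directory makes each run independent. A standalone sketch of the same pattern, with a plain dict standing in for the real config class:

    import json
    import os
    import uuid

    # One unique scratch file per run: no clashes between parallel test workers.
    path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump({"hidden_size": 768}, f)   # stand-in for config.to_json_file(path)
        with open(path, encoding="utf-8") as f:
            roundtrip = json.load(f)             # stand-in for from_json_file(path)
        assert roundtrip == {"hidden_size": 768}
    finally:
        os.remove(path)                          # clean up even if the round-trip fails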
@@ -187,6 +187,8 @@ class BertTokenizer(PreTrainedTokenizer):
         index = 0
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        else:
+            vocab_file = vocab_path
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
...
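The new else branch means the save path can be either a directory (the standard vocab filename is appended) or a full file path (used as-is). Assumed usage through save_vocabulary on a pretrained tokenizer:

    import os
    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    os.makedirs('./saved', exist_ok=True)
    tokenizer.save_vocabulary('./saved')          # directory: writes ./saved/vocab.txt
    tokenizer.save_vocabulary('./my_vocab.txt')   # exact file path: now also accepted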
@@ -45,17 +45,20 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
     },
     'merges_file':
     {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
     },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
     'gpt2-medium': 1024,
+    'gpt2-large': 1024,
 }
 
 @lru_cache()
...
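With the three new entries, the large checkpoint resolves its vocabulary, merges, and maximum input length the same way as the existing names. Assumed usage:

    from pytorch_transformers import GPT2Tokenizer

    # 'gpt2-large' now resolves to the S3 vocab/merges URLs registered above.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
    print(tokenizer.max_len)   # 1024, taken from PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES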
@@ -89,8 +89,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         try:
             import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
...
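spacy.load('en') requires the separately downloaded 'en' model package and loads a full pipeline just to tokenize; instantiating the blank English language class needs nothing beyond the spacy install itself. A standalone sketch of the replacement (spaCy 2.x API, as used in the diff):

    from spacy.lang.en import English

    _nlp = English()                             # blank pipeline: no 'en' model download
    nlp = _nlp.Defaults.create_tokenizer(_nlp)   # rule-based tokenizer only

    print([t.text for t in nlp("Don't tokenize me.")])
    # ['Do', "n't", 'tokenize', 'me', '.']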
@@ -193,6 +193,13 @@ class PreTrainedTokenizer(object):
             cache_dir: (`optional`) string:
                 Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
+            force_download: (`optional`) boolean, default False:
+                Force a (re-)download of the vocabulary files, overriding the cached versions if they exist.
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
             kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
@@ -223,6 +230,8 @@ class PreTrainedTokenizer(object):
     @classmethod
     def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
+        proxies = kwargs.pop('proxies', None)
 
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
@@ -283,7 +292,7 @@ class PreTrainedTokenizer(object):
                 if file_path is None:
                     resolved_vocab_files[file_id] = None
                 else:
-                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir)
+                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
         except EnvironmentError:
             if pretrained_model_name_or_path in s3_models:
                 logger.error("Couldn't reach server to download vocabulary.")
@@ -477,15 +486,45 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
         """
+        def split_on_token(tok, text):
+            result = []
+            split_text = text.split(tok)
+            for i, sub_text in enumerate(split_text):
+                sub_text = sub_text.strip()
+                if i == 0 and not sub_text:
+                    result += [tok]
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result += [sub_text]
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result += [sub_text]
+                    result += [tok]
+            return result
+
         def split_on_tokens(tok_list, text):
             if not text:
                 return []
             if not tok_list:
                 return self._tokenize(text, **kwargs)
-            tok = tok_list[0]
-            split_text = text.split(tok)
-            return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
-                for sub_text in split_text), [])[:-1]
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self.added_tokens_encoder \
+                            and sub_text not in self.all_special_tokens:
+                        tokenized_text += split_on_token(tok, sub_text)
+                    else:
+                        tokenized_text += [sub_text]
+                text_list = tokenized_text
+
+            return sum((self._tokenize(token, **kwargs) if token not \
+                in self.added_tokens_encoder and token not in self.all_special_tokens \
+                else [token] for token in tokenized_text), [])
 
         added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
...
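The old recursive splitter re-split every fragment on every remaining token, including fragments that were themselves added or special tokens, so a token occurring inside another string could be broken apart, and the trailing [:-1] could drop a token. The iterative rewrite keeps matched tokens atomic and only sends ordinary text through _tokenize. Illustrative behavior, with a hypothetical added token on bert-base-uncased:

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.add_tokens(['[SITUATION]'])    # hypothetical added token

    print(tokenizer.tokenize("hello [SITUATION] world"))
    # ['hello', '[SITUATION]', 'world'] -- the added token stays atomic;
    # only the surrounding text goes through the model's subword tokenizer.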
@@ -124,8 +124,9 @@ class XLMTokenizer(PreTrainedTokenizer):
                          **kwargs)
         try:
             import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
...