"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "d6b8e9cec7301ba02f642588a6f12e78ec3b9798"
Unverified Commit 110394b2 authored by Thomas Wolf's avatar Thomas Wolf Committed by GitHub
Browse files

Merge branch 'master' into t5

parents 33e72b08 7296f101
...@@ -23,7 +23,7 @@ import logging ...@@ -23,7 +23,7 @@ import logging
from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
from .utils import slow from .utils import slow, SMALL_MODEL_IDENTIFIER
class AutoTokenizerTest(unittest.TestCase): class AutoTokenizerTest(unittest.TestCase):
...@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase): ...@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
self.assertIsInstance(tokenizer, GPT2Tokenizer) self.assertIsInstance(tokenizer, GPT2Tokenizer)
self.assertGreater(len(tokenizer), 0) self.assertGreater(len(tokenizer), 0)
def test_tokenizer_from_pretrained_identifier(self):
logging.basicConfig(level=logging.INFO)
tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(tokenizer, BertTokenizer)
self.assertEqual(len(tokenizer), 12)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import unittest
from io import open
from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
MecabTokenizer, CharacterTokenizer,
VOCAB_FILES_NAMES)
from .tokenization_tests_commons import CommonTestCases
from .utils import slow, custom_tokenizers
@custom_tokenizers
class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer_class = BertJapaneseTokenizer
def setUp(self):
super(BertJapaneseTokenizationTest, self).setUp()
vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"こんにちは、世界。 \nこんばんは、世界。"
output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file)
tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
self.assertListEqual(tokens,
[u"こんにちは", u"、", u"世界", u"。",
u"こん", u"##ばんは", u"、", u"世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
[3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_mecab_tokenizer(self):
tokenizer = MecabTokenizer()
self.assertListEqual(
tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
[u"アップルストア", u"で", u"iPhone", u"8", u"が",
u"発売", u"さ", u"れ", u"た", u"。"])
def test_mecab_tokenizer_lower(self):
tokenizer = MecabTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
[u"アップルストア", u"で", u"iphone", u"8", u"が",
u"発売", u"さ", u"れ", u"た", u"。"])
def test_mecab_tokenizer_no_normalize(self):
tokenizer = MecabTokenizer(normalize_text=False)
self.assertListEqual(
tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "),
[u"アップルストア", u"で", u"iPhone", u"8", u"が",
u"発売", u"さ", u"れ", u"た", u" ", u"。"])
def test_wordpiece_tokenizer(self):
vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")
self.assertListEqual(tokenizer.tokenize(u""), [])
self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
[u"こんにちは"])
self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
[u"こん", u"##ばんは"])
self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
[u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
# 2 is for "[CLS]", 3 is for "[SEP]"
assert encoded_sentence == [2] + text + [3]
assert encoded_pair == [2] + text + [3] + text_2 + [3]
class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer_class = BertJapaneseTokenizer
def setUp(self):
super(BertJapaneseCharacterTokenizationTest, self).setUp()
vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
subword_tokenizer_type="character",
**kwargs)
def get_input_output_texts(self):
input_text = u"こんにちは、世界。 \nこんばんは、世界。"
output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file,
subword_tokenizer_type="character")
tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
self.assertListEqual(tokens,
[u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
[3, 4, 5, 6, 7, 11, 9, 10, 12,
3, 4, 8, 4, 7, 11, 9, 10, 12])
def test_character_tokenizer(self):
vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")
self.assertListEqual(tokenizer.tokenize(u""), [])
self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
[u"こ", u"ん", u"に", u"ち", u"は"])
self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
[u"こ", u"ん", u"に", u"ち", u"[UNK]"])
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
# 2 is for "[CLS]", 3 is for "[SEP]"
assert encoded_sentence == [2] + text + [3]
assert encoded_pair == [2] + text + [3] + text_2 + [3]
...@@ -6,18 +6,26 @@ from distutils.util import strtobool ...@@ -6,18 +6,26 @@ from distutils.util import strtobool
from transformers.file_utils import _tf_available, _torch_available from transformers.file_utils import _tf_available, _torch_available
try: SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
run_slow = os.environ["RUN_SLOW"]
except KeyError:
# RUN_SLOW isn't set, default to skipping slow tests. def parse_flag_from_env(key, default=False):
_run_slow_tests = False
else:
# RUN_SLOW is set, convert it to True or False.
try: try:
_run_slow_tests = strtobool(run_slow) value = os.environ[key]
except ValueError: except KeyError:
# More values are supported, but let's keep the message simple. # KEY isn't set, default to `default`.
raise ValueError("If set, RUN_SLOW must be yes or no.") _value = default
else:
# KEY is set, convert it to True or False.
try:
_value = strtobool(value)
except ValueError:
# More values are supported, but let's keep the message simple.
raise ValueError("If set, {} must be yes or no.".format(key))
return _value
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
def slow(test_case): def slow(test_case):
...@@ -33,6 +41,19 @@ def slow(test_case): ...@@ -33,6 +41,19 @@ def slow(test_case):
return test_case return test_case
def custom_tokenizers(test_case):
"""
Decorator marking a test for a custom tokenizer.
Custom tokenizers require additional dependencies, and are skipped
by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
to a truthy value to run them.
"""
if not _run_custom_tokenizers:
test_case = unittest.skip("test of custom tokenizers")(test_case)
return test_case
def require_torch(test_case): def require_torch(test_case):
""" """
Decorator marking a test that requires PyTorch. Decorator marking a test that requires PyTorch.
...@@ -59,6 +80,6 @@ def require_tf(test_case): ...@@ -59,6 +80,6 @@ def require_tf(test_case):
if _torch_available: if _torch_available:
# Set the USE_CUDA environment variable to select a GPU. # Set the USE_CUDA environment variable to select a GPU.
torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu" torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
else: else:
torch_device = None torch_device = None
...@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera ...@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging import logging
from .tokenization_bert import BertTokenizer from .tokenization_bert import BertTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_ctrl import CTRLTokenizer from .tokenization_ctrl import CTRLTokenizer
...@@ -75,6 +76,7 @@ class AutoTokenizer(object): ...@@ -75,6 +76,7 @@ class AutoTokenizer(object):
- contains `albert`: AlbertTokenizer (ALBERT model) - contains `albert`: AlbertTokenizer (ALBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model) - contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
- contains `bert`: BertTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
...@@ -87,6 +89,7 @@ class AutoTokenizer(object): ...@@ -87,6 +89,7 @@ class AutoTokenizer(object):
pretrained_model_name_or_path: either: pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
...@@ -109,8 +112,14 @@ class AutoTokenizer(object): ...@@ -109,8 +112,14 @@ class AutoTokenizer(object):
Examples:: Examples::
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. # Download vocabulary from S3 and cache.
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Download vocabulary from S3 (user-uploaded) and cache.
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
""" """
if 't5' in pretrained_model_name_or_path: if 't5' in pretrained_model_name_or_path:
...@@ -123,6 +132,8 @@ class AutoTokenizer(object): ...@@ -123,6 +132,8 @@ class AutoTokenizer(object):
return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'roberta' in pretrained_model_name_or_path: elif 'roberta' in pretrained_model_name_or_path:
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert-base-japanese' in pretrained_model_name_or_path:
return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert' in pretrained_model_name_or_path: elif 'bert' in pretrained_model_name_or_path:
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path: elif 'openai-gpt' in pretrained_model_name_or_path:
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import six
import unicodedata
from io import open
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'bert-base-japanese': 512,
'bert-base-japanese-whole-word-masking': 512,
'bert-base-japanese-char': 512,
'bert-base-japanese-char-whole-word-masking': 512
}
PRETRAINED_INIT_CONFIGURATION = {
'bert-base-japanese': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'wordpiece'
},
'bert-base-japanese-whole-word-masking':{
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'wordpiece'
},
'bert-base-japanese-char': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'character'
},
'bert-base-japanese-char-whole-word-masking': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'character'
}
}
class BertJapaneseTokenizer(BertTokenizer):
"""BERT tokenizer for Japanese text"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, do_lower_case=False,
do_word_tokenize=True, do_subword_tokenize=True,
word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
never_split=None, unk_token='[UNK]', sep_token='[SEP]',
pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
"""Constructs a MecabBertTokenizer.
Args:
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
**do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input.
Only has an effect when do_basic_tokenize=True.
**do_word_tokenize**: (`optional`) boolean (default True)
Whether to do word tokenization.
**do_subword_tokenize**: (`optional`) boolean (default True)
Whether to do subword tokenization.
**word_tokenizer_type**: (`optional`) string (default "basic")
Type of word tokenizer.
**subword_tokenizer_type**: (`optional`) string (default "wordpiece")
Type of subword tokenizer.
"""
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.do_word_tokenize = do_word_tokenize
if do_word_tokenize:
if word_tokenizer_type == 'basic':
self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=False)
elif word_tokenizer_type == 'mecab':
self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
else:
raise ValueError(
"Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
self.do_subword_tokenize = do_subword_tokenize
if do_subword_tokenize:
if subword_tokenizer_type == 'wordpiece':
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
unk_token=self.unk_token)
elif subword_tokenizer_type == 'character':
self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
unk_token=self.unk_token)
else:
raise ValueError(
"Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
def _tokenize(self, text):
if self.do_word_tokenize:
tokens = self.word_tokenizer.tokenize(text,
never_split=self.all_special_tokens)
else:
tokens = [text]
if self.do_subword_tokenize:
split_tokens = [sub_token for token in tokens
for sub_token in self.subword_tokenizer.tokenize(token)]
else:
split_tokens = tokens
return split_tokens
class MecabTokenizer(object):
"""Runs basic tokenization with MeCab morphological parser."""
def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
"""Constructs a MecabTokenizer.
Args:
**do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input.
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
List of token not to split.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
import MeCab
self.mecab = MeCab.Tagger()
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize('NFKC', text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
if six.PY2:
mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
else:
mecab_output = self.mecab.parse(text)
cursor = 0
for line in mecab_output.split('\n'):
if line == 'EOS':
break
token, _ = line.split('\t')
token_start = text.index(token, cursor)
token_end = token_start + len(token)
if self.do_lower_case and token not in never_split:
token = token.lower()
tokens.append(token)
cursor = token_end
return tokens
class CharacterTokenizer(object):
"""Runs Character tokenziation."""
def __init__(self, vocab, unk_token, normalize_text=True):
"""Constructs a CharacterTokenizer.
Args:
**vocab**:
Vocabulary object.
**unk_token**: str
A special symbol for out-of-vocabulary token.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
"""
self.vocab = vocab
self.unk_token = unk_token
self.normalize_text = normalize_text
def tokenize(self, text):
"""Tokenizes a piece of text into characters.
For example:
input = "apple"
output = ["a", "p", "p", "l", "e"]
Args:
text: A single token or whitespace separated tokens.
This should have already been passed through `BasicTokenizer`.
Returns:
A list of characters.
"""
if self.normalize_text:
text = unicodedata.normalize('NFKC', text)
output_tokens = []
for i, char in enumerate(text):
if char not in self.vocab:
output_tokens.append(self.unk_token)
continue
output_tokens.append(char)
return output_tokens
...@@ -25,7 +25,7 @@ import itertools ...@@ -25,7 +25,7 @@ import itertools
import re import re
from io import open from io import open
from .file_utils import cached_path, is_tf_available, is_torch_available from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
if is_tf_available(): if is_tf_available():
import tensorflow as tf import tensorflow as tf
...@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object): ...@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object):
self.max_len = max_len if max_len is not None else int(1e12) self.max_len = max_len if max_len is not None else int(1e12)
# Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed. # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
self.padding_side = kwargs.pop('padding_side', self.padding_side) self.padding_side = kwargs.pop('padding_side', self.padding_side)
# Added tokens # Added tokens
...@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object): ...@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):
pretrained_model_name_or_path: either: pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
...@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object): ...@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):
# Download vocabulary from S3 and cache. # Download vocabulary from S3 and cache.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Download vocabulary from S3 (user-uploaded) and cache.
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
...@@ -327,12 +331,15 @@ class PreTrainedTokenizer(object): ...@@ -327,12 +331,15 @@ class PreTrainedTokenizer(object):
if os.path.isdir(pretrained_model_name_or_path): if os.path.isdir(pretrained_model_name_or_path):
# If a directory is provided we look for the standard filenames # If a directory is provided we look for the standard filenames
full_file_name = os.path.join(pretrained_model_name_or_path, file_name) full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
else: if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
# If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
full_file_name = pretrained_model_name_or_path full_file_name = pretrained_model_name_or_path
if not os.path.exists(full_file_name): else:
logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
full_file_name = None
vocab_files[file_id] = full_file_name vocab_files[file_id] = full_file_name
# Look for the additional tokens files # Look for the additional tokens files
...@@ -628,7 +635,6 @@ class PreTrainedTokenizer(object): ...@@ -628,7 +635,6 @@ class PreTrainedTokenizer(object):
Take care of added tokens. Take care of added tokens.
text: The sequence to be encoded. text: The sequence to be encoded.
return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
**kwargs: passed to the child `self.tokenize()` method **kwargs: passed to the child `self.tokenize()` method
""" """
def lowercase_text(t): def lowercase_text(t):
...@@ -663,7 +669,7 @@ class PreTrainedTokenizer(object): ...@@ -663,7 +669,7 @@ class PreTrainedTokenizer(object):
return result return result
def split_on_tokens(tok_list, text): def split_on_tokens(tok_list, text):
if not text: if not text.strip():
return [] return []
if not tok_list: if not tok_list:
return self._tokenize(text, **kwargs) return self._tokenize(text, **kwargs)
...@@ -917,7 +923,7 @@ class PreTrainedTokenizer(object): ...@@ -917,7 +923,7 @@ class PreTrainedTokenizer(object):
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers. or PyTorch torch.Tensor instead of a list of python integers.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoir returning attention mask (default True) return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
...@@ -962,24 +968,13 @@ class PreTrainedTokenizer(object): ...@@ -962,24 +968,13 @@ class PreTrainedTokenizer(object):
if add_special_tokens: if add_special_tokens:
sequence = self.build_inputs_with_special_tokens(ids, pair_ids) sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
else: else:
sequence = ids + pair_ids if pair else ids sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
if return_special_tokens_mask: if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
# Prepare inputs as tensors if asked
if return_tensors == 'tf' and is_tf_available():
sequence = tf.constant([sequence])
token_type_ids = tf.constant([token_type_ids])
elif return_tensors == 'pt' and is_torch_available():
sequence = torch.tensor([sequence])
token_type_ids = torch.tensor([token_type_ids])
elif return_tensors is not None:
logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
encoded_inputs["input_ids"] = sequence encoded_inputs["input_ids"] = sequence
if return_token_type_ids: if return_token_type_ids:
encoded_inputs["token_type_ids"] = token_type_ids encoded_inputs["token_type_ids"] = token_type_ids
...@@ -1003,7 +998,7 @@ class PreTrainedTokenizer(object): ...@@ -1003,7 +998,7 @@ class PreTrainedTokenizer(object):
) )
if pad_to_max_length and max_length is None and self.max_len > 10000: if pad_to_max_length and max_length is None and self.max_len > 10000:
logger.warning("Sequence can't be padded as the maximum ") logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
if needs_to_be_padded: if needs_to_be_padded:
difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
...@@ -1016,10 +1011,9 @@ class PreTrainedTokenizer(object): ...@@ -1016,10 +1011,9 @@ class PreTrainedTokenizer(object):
if return_special_tokens_mask: if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
elif self.padding_side == 'left': elif self.padding_side == 'left':
if return_attention_mask: if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
if return_token_type_ids: if return_token_type_ids:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
if return_special_tokens_mask: if return_special_tokens_mask:
...@@ -1031,7 +1025,26 @@ class PreTrainedTokenizer(object): ...@@ -1031,7 +1025,26 @@ class PreTrainedTokenizer(object):
elif return_attention_mask: elif return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
# Prepare inputs as tensors if asked
if return_tensors == 'tf' and is_tf_available():
encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
elif return_tensors == 'pt' and is_torch_available():
encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
elif return_tensors is not None:
logger.warning(
"Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
return_tensors))
return encoded_inputs return encoded_inputs
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
......
...@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
additional_special_tokens=additional_special_tokens, additional_special_tokens=additional_special_tokens,
**kwargs) **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
# cache of sm.MosesPunctNormalizer instance # cache of sm.MosesPunctNormalizer instance
self.cache_moses_punct_normalizer = dict() self.cache_moses_punct_normalizer = dict()
# cache of sm.MosesTokenizer instance # cache of sm.MosesTokenizer instance
......
""" Link tester.
This little utility reads all the python files in the repository,
scans for links pointing to S3 and tests the links one by one. Raises an error
at the end of the scan if at least one link was reported broken.
"""
import os
import re
import sys
import requests
REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
def list_python_files_in_repository():
""" List all python files in the repository.
This function assumes that the script is executed in the root folder.
"""
source_code_files = []
for path, subdirs, files in os.walk("."):
if "templates" in path:
continue
for name in files:
if ".py" in name and ".pyc" not in name:
path_to_files = os.path.join(path, name)
source_code_files.append(path_to_files)
return source_code_files
def find_all_links(file_paths):
links = []
for path in file_paths:
links += scan_code_for_links(path)
return links
def scan_code_for_links(source):
""" Scans the file to find links using a regular expression.
Returns a list of links.
"""
with open(source, 'r') as content:
content = content.read()
raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
links = [prefix + suffix for _, prefix, suffix in raw_links]
return links
def check_all_links(links):
""" Check that the provided links are valid.
Links are considered valid if a HEAD request to the server
returns a 200 status code.
"""
broken_links = []
for link in links:
head = requests.head(link)
if head.status_code != 200:
broken_links.append(link)
return broken_links
if __name__ == "__main__":
file_paths = list_python_files_in_repository()
links = find_all_links(file_paths)
broken_links = check_all_links(links)
print("Looking for broken links to pre-trained models/configs/tokenizers...")
if broken_links:
print("The following links did not respond:")
for link in broken_links:
print("- {}".format(link))
sys.exit(1)
print("All links are ok.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment