Commit 40ed7172 authored by erenup's avatar erenup
Browse files

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
......@@ -21,6 +21,7 @@ from io import open
from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases
from .utils import slow
class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
......@@ -70,19 +71,20 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer = self.get_tokenizer()
self.assertListEqual(
tokenizer.encode('Hello world!'),
tokenizer.encode('Hello world!', add_special_tokens=False),
[0, 31414, 232, 328, 2]
)
self.assertListEqual(
tokenizer.encode('Hello world! cécé herlolip 418'),
tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
)
@slow
def test_sequence_builders(self):
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
......
......@@ -79,13 +79,13 @@ class CommonTestCases:
# Now let's start the test
tokenizer = self.get_tokenizer(max_len=42)
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
with TemporaryDirectory() as tmpdirname:
tokenizer.save_pretrained(tmpdirname)
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
self.assertListEqual(before_tokens, after_tokens)
self.assertEqual(tokenizer.max_len, 42)
......@@ -102,14 +102,48 @@ class CommonTestCases:
with TemporaryDirectory() as tmpdirname:
filename = os.path.join(tmpdirname, u"tokenizer.bin")
pickle.dump(tokenizer, open(filename, "wb"))
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
tokenizer_new = pickle.load(open(filename, "rb"))
with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)
subwords_loaded = tokenizer_new.tokenize(text)
self.assertListEqual(subwords, subwords_loaded)
def test_added_tokens_do_lower_case(self):
tokenizer = self.get_tokenizer(do_lower_case=True)
special_token = tokenizer.all_special_tokens[0]
text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token
toks0 = tokenizer.tokenize(text) # toks before adding new_toks
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD']
added = tokenizer.add_tokens(new_toks)
self.assertEqual(added, 2)
toks = tokenizer.tokenize(text)
toks2 = tokenizer.tokenize(text2)
self.assertEqual(len(toks), len(toks2))
self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer
self.assertListEqual(toks, toks2)
tokenizer = self.get_tokenizer(do_lower_case=False)
added = tokenizer.add_tokens(new_toks)
self.assertEqual(added, 4)
toks = tokenizer.tokenize(text)
toks2 = tokenizer.tokenize(text2)
self.assertEqual(len(toks), len(toks2)) # Length should still be the same
self.assertNotEqual(len(toks), len(toks0))
self.assertNotEqual(toks[1], toks2[1]) # But at least the first non-special tokens should differ
def test_add_tokens_tokenizer(self):
tokenizer = self.get_tokenizer()
......@@ -130,7 +164,7 @@ class CommonTestCases:
self.assertEqual(added_toks, len(new_toks))
self.assertEqual(all_size_2, all_size + len(new_toks))
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
out_string = tokenizer.decode(tokens)
self.assertGreaterEqual(len(tokens), 4)
......@@ -148,7 +182,8 @@ class CommonTestCases:
self.assertEqual(added_toks_2, len(new_toks_2))
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
add_special_tokens=False)
out_string = tokenizer.decode(tokens)
self.assertGreaterEqual(len(tokens), 6)
......@@ -159,6 +194,26 @@ class CommonTestCases:
self.assertEqual(tokens[0], tokenizer.eos_token_id)
self.assertEqual(tokens[-2], tokenizer.pad_token_id)
def test_add_special_tokens(self):
tokenizer = self.get_tokenizer()
input_text, output_text = self.get_input_output_texts()
special_token = "[SPECIAL TOKEN]"
tokenizer.add_special_tokens({"cls_token": special_token})
encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
assert len(encoded_special_token) == 1
text = " ".join([input_text, special_token, output_text])
encoded = tokenizer.encode(text, add_special_tokens=False)
input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
assert encoded == input_encoded + special_token_id + output_encoded
decoded = tokenizer.decode(encoded, skip_special_tokens=True)
assert special_token not in decoded
def test_required_methods_tokenizer(self):
tokenizer = self.get_tokenizer()
......@@ -166,7 +221,7 @@ class CommonTestCases:
tokens = tokenizer.tokenize(input_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids_2 = tokenizer.encode(input_text)
ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
self.assertListEqual(ids, ids_2)
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
......@@ -206,7 +261,7 @@ class CommonTestCases:
seq_0 = "Test this method."
seq_1 = "With these inputs."
sequences = tokenizer.encode(seq_0, seq_1)
sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
# Method is implemented (e.g. not GPT-2)
......@@ -219,10 +274,14 @@ class CommonTestCases:
seq_0 = "This is a sentence to be encoded."
stride = 2
sequence = tokenizer.encode(seq_0)
sequence = tokenizer.encode(seq_0, add_special_tokens=False)
num_added_tokens = tokenizer.num_added_tokens()
total_length = len(sequence) + num_added_tokens
information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
information = tokenizer.encode_plus(seq_0,
max_length=total_length - 2,
add_special_tokens=True,
stride=stride,
return_overflowing_tokens=True)
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
......@@ -239,20 +298,22 @@ class CommonTestCases:
seq_1 = "This is another sentence to be encoded."
stride = 2
sequence_0_no_special_tokens = tokenizer.encode(seq_0)
sequence_1_no_special_tokens = tokenizer.encode(seq_1)
sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
tokenizer.encode(seq_0),
tokenizer.encode(seq_1)[:-2]
tokenizer.encode(seq_0, add_special_tokens=False),
tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
)
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
stride=stride, truncation_strategy='only_second')
stride=stride, truncation_strategy='only_second',
return_overflowing_tokens=True)
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
add_special_tokens=True, stride=stride,
truncation_strategy='only_first')
truncation_strategy='only_first',
return_overflowing_tokens=True)
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
......@@ -283,8 +344,8 @@ class CommonTestCases:
sequence_1 = "This one too please."
# Testing single inputs
encoded_sequence = tokenizer.encode(sequence_0)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
......@@ -294,8 +355,10 @@ class CommonTestCases:
self.assertEqual(encoded_sequence, filtered_sequence)
# Testing inputs pairs
encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True,
return_special_tokens_mask=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
......@@ -307,9 +370,98 @@ class CommonTestCases:
# Testing with already existing special tokens
if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0,
add_special_tokens=True,
return_special_tokens_mask=True)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
def test_padding_to_max_length(self):
tokenizer = self.get_tokenizer()
sequence = "Sequence"
padding_size = 10
padding_idx = tokenizer.pad_token_id
# RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "left"
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
# RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
tokenizer.padding_side = "left"
padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
padded_sequence_left_length = len(padded_sequence_left)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
assert sequence_length == padded_sequence_left_length
assert encoded_sequence == padded_sequence_left
def test_encode_plus_with_padding(self):
tokenizer = self.get_tokenizer()
sequence = "Sequence"
padding_size = 10
padding_idx = tokenizer.pad_token_id
token_type_padding_idx = tokenizer.pad_token_type_id
encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
input_ids = encoded_sequence['input_ids']
token_type_ids = encoded_sequence['token_type_ids']
attention_mask = encoded_sequence['attention_mask']
special_tokens_mask = encoded_sequence['special_tokens_mask']
sequence_length = len(input_ids)
# Test right padding
tokenizer.padding_side = "right"
padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
padded_input_ids = padded_sequence['input_ids']
padded_token_type_ids = padded_sequence['token_type_ids']
padded_attention_mask = padded_sequence['attention_mask']
padded_special_tokens_mask = padded_sequence['special_tokens_mask']
padded_sequence_length = len(padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
assert input_ids + [padding_idx] * padding_size == padded_input_ids
assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
assert attention_mask + [0] * padding_size == padded_attention_mask
assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
# Test left padding
tokenizer.padding_side = "left"
padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
padded_input_ids = padded_sequence['input_ids']
padded_token_type_ids = padded_sequence['token_type_ids']
padded_attention_mask = padded_sequence['attention_mask']
padded_special_tokens_mask = padded_sequence['special_tokens_mask']
padded_sequence_length = len(padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + input_ids == padded_input_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
assert [0] * padding_size + attention_mask == padded_attention_mask
assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
\ No newline at end of file
......@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import os
import unittest
import pytest
from io import open
from transformers import is_torch_available
......@@ -24,11 +23,12 @@ from transformers import is_torch_available
if is_torch_available():
import torch
from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
else:
pytestmark = pytest.mark.skip("Require Torch") # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
from .tokenization_tests_commons import CommonTestCases
from .utils import require_torch
@require_torch
class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
......
......@@ -22,7 +22,10 @@ import six
from transformers import PreTrainedTokenizer
from transformers.tokenization_gpt2 import GPT2Tokenizer
from .utils import slow
class TokenizerUtilsTest(unittest.TestCase):
def check_tokenizer_from_pretrained(self, tokenizer_class):
s3_models = list(tokenizer_class.max_model_input_sizes.keys())
for model_name in s3_models[:1]:
......@@ -39,6 +42,7 @@ class TokenizerUtilsTest(unittest.TestCase):
special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
self.assertIsInstance(special_tok_id, int)
@slow
def test_pretrained_tokenizers(self):
self.check_tokenizer_from_pretrained(GPT2Tokenizer)
......
......@@ -21,6 +21,7 @@ import json
from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases
from .utils import slow
class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
......@@ -66,11 +67,12 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
self.assertListEqual(
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
@slow
def test_sequence_builders(self):
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
......
......@@ -20,6 +20,7 @@ import unittest
from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
from .tokenization_tests_commons import CommonTestCases
from .utils import slow
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'fixtures/test_sentencepiece.model')
......@@ -89,11 +90,12 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
@slow
def test_sequence_builders(self):
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
......
import os
import unittest
from distutils.util import strtobool
from transformers.file_utils import _tf_available, _torch_available
SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
def parse_flag_from_env(key, default=False):
try:
value = os.environ[key]
except KeyError:
# KEY isn't set, default to `default`.
_value = default
else:
# KEY is set, convert it to True or False.
try:
_value = strtobool(value)
except ValueError:
# More values are supported, but let's keep the message simple.
raise ValueError("If set, {} must be yes or no.".format(key))
return _value
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
def slow(test_case):
"""
Decorator marking a test as slow.
Slow tests are skipped by default. Set the RUN_SLOW environment variable
to a truthy value to run them.
"""
if not _run_slow_tests:
test_case = unittest.skip("test is slow")(test_case)
return test_case
def custom_tokenizers(test_case):
"""
Decorator marking a test for a custom tokenizer.
Custom tokenizers require additional dependencies, and are skipped
by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
to a truthy value to run them.
"""
if not _run_custom_tokenizers:
test_case = unittest.skip("test of custom tokenizers")(test_case)
return test_case
def require_torch(test_case):
"""
Decorator marking a test that requires PyTorch.
These tests are skipped when PyTorch isn't installed.
"""
if not _torch_available:
test_case = unittest.skip("test requires PyTorch")(test_case)
return test_case
def require_tf(test_case):
"""
Decorator marking a test that requires TensorFlow.
These tests are skipped when TensorFlow isn't installed.
"""
if not _tf_available:
test_case = unittest.skip("test requires TensorFlow")(test_case)
return test_case
if _torch_available:
# Set the USE_CUDA environment variable to select a GPU.
torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
else:
torch_device = None
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from .tokenization_utils import PreTrainedTokenizer
import logging
import unicodedata
import six
import os
from shutil import copyfile
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'albert-base-v1': 512,
'albert-large-v1': 512,
'albert-xlarge-v1': 512,
'albert-xxlarge-v1': 512,
'albert-base-v2': 512,
'albert-large-v2': 512,
'albert-xlarge-v2': 512,
'albert-xxlarge-v2': 512,
}
SPIECE_UNDERLINE = u'▁'
class AlbertTokenizer(PreTrainedTokenizer):
"""
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file,
do_lower_case=True, remove_space=True, keep_accents=False,
bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
try:
import sentencepiece as spm
except ImportError:
logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece")
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
@property
def vocab_size(self):
return len(self.sp_model)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
def __setstate__(self, d):
self.__dict__ = d
try:
import sentencepiece as spm
except ImportError:
logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece")
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = ' '.join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if six.PY2 and isinstance(outputs, str):
outputs = outputs.decode('utf-8')
if not self.keep_accents:
outputs = unicodedata.normalize('NFKD', outputs)
outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text, return_unicode=True, sample=False):
""" Tokenize a string.
return_unicode is used only for py2
"""
text = self.preprocess_text(text)
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
if six.PY2 and isinstance(text, unicode):
text = text.encode('utf-8')
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(
piece[:-1].replace(SPIECE_UNDERLINE, ''))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
else:
cur_pieces[0] = cur_pieces[0][1:]
cur_pieces.append(piece[-1])
new_pieces.extend(cur_pieces)
else:
new_pieces.append(piece)
# note(zhiliny): convert back to unicode for py2
if six.PY2 and return_unicode:
ret_pieces = []
for piece in new_pieces:
if isinstance(piece, str):
piece = piece.decode('utf-8')
ret_pieces.append(piece)
new_pieces = ret_pieces
return new_pieces
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index, return_unicode=True):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
token = self.sp_model.IdToPiece(index)
if six.PY2 and return_unicode and isinstance(token, str):
token = token.decode('utf-8')
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
An ALBERT sequence has the following format:
single sequence: [CLS] X [SEP]
pair of sequences: [CLS] A [SEP] B [SEP]
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An ALBERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
if token_ids_1 is None, only returns the first portion of the mask (0's).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
......@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
from .tokenization_bert import BertTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_ctrl import CTRLTokenizer
......@@ -27,6 +28,8 @@ from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_albert import AlbertTokenizer
logger = logging.getLogger(__name__)
......@@ -42,14 +45,16 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `albert`: AlbertTokenizer (ALBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- contains `xlnet`: XLNetTokenizer (XLNet model)
- contains `xlm`: XLMTokenizer (XLM model)
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
This class cannot be instantiated using `__init__()` (throw an error).
"""
......@@ -65,19 +70,23 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (XLM model)
- contains `albert`: AlbertTokenizer (ALBERT model)
- contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
- contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- contains `xlnet`: XLNetTokenizer (XLNet model)
- contains `xlm`: XLMTokenizer (XLM model)
- contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
Params:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
......@@ -87,6 +96,9 @@ class AutoTokenizer(object):
force_download: (`optional`) boolean, default False:
Force to (re-)download the vocabulary files and override the cached versions if they exists.
resume_download: (`optional`) boolean, default False:
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
......@@ -97,14 +109,26 @@ class AutoTokenizer(object):
Examples::
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
# Download vocabulary from S3 and cache.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Download vocabulary from S3 (user-uploaded) and cache.
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
"""
if 'distilbert' in pretrained_model_name_or_path:
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert-base-japanese' in pretrained_model_name_or_path:
return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
......@@ -121,4 +145,4 @@ class AutoTokenizer(object):
return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
"'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
......@@ -220,7 +220,7 @@ class BertTokenizer(PreTrainedTokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import six
import unicodedata
from io import open
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'bert-base-japanese': 512,
'bert-base-japanese-whole-word-masking': 512,
'bert-base-japanese-char': 512,
'bert-base-japanese-char-whole-word-masking': 512
}
PRETRAINED_INIT_CONFIGURATION = {
'bert-base-japanese': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'wordpiece'
},
'bert-base-japanese-whole-word-masking':{
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'wordpiece'
},
'bert-base-japanese-char': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'character'
},
'bert-base-japanese-char-whole-word-masking': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'character'
}
}
class BertJapaneseTokenizer(BertTokenizer):
"""BERT tokenizer for Japanese text"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, do_lower_case=False,
do_word_tokenize=True, do_subword_tokenize=True,
word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
never_split=None, unk_token='[UNK]', sep_token='[SEP]',
pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
"""Constructs a MecabBertTokenizer.
Args:
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
**do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input.
Only has an effect when do_basic_tokenize=True.
**do_word_tokenize**: (`optional`) boolean (default True)
Whether to do word tokenization.
**do_subword_tokenize**: (`optional`) boolean (default True)
Whether to do subword tokenization.
**word_tokenizer_type**: (`optional`) string (default "basic")
Type of word tokenizer.
**subword_tokenizer_type**: (`optional`) string (default "wordpiece")
Type of subword tokenizer.
"""
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.do_word_tokenize = do_word_tokenize
if do_word_tokenize:
if word_tokenizer_type == 'basic':
self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=False)
elif word_tokenizer_type == 'mecab':
self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
else:
raise ValueError(
"Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
self.do_subword_tokenize = do_subword_tokenize
if do_subword_tokenize:
if subword_tokenizer_type == 'wordpiece':
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
unk_token=self.unk_token)
elif subword_tokenizer_type == 'character':
self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
unk_token=self.unk_token)
else:
raise ValueError(
"Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
def _tokenize(self, text):
if self.do_word_tokenize:
tokens = self.word_tokenizer.tokenize(text,
never_split=self.all_special_tokens)
else:
tokens = [text]
if self.do_subword_tokenize:
split_tokens = [sub_token for token in tokens
for sub_token in self.subword_tokenizer.tokenize(token)]
else:
split_tokens = tokens
return split_tokens
class MecabTokenizer(object):
"""Runs basic tokenization with MeCab morphological parser."""
def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
"""Constructs a MecabTokenizer.
Args:
**do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input.
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
List of token not to split.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split if never_split is not None else []
self.normalize_text = normalize_text
import MeCab
self.mecab = MeCab.Tagger()
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize('NFKC', text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
if six.PY2:
mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
else:
mecab_output = self.mecab.parse(text)
cursor = 0
for line in mecab_output.split('\n'):
if line == 'EOS':
break
token, _ = line.split('\t')
token_start = text.index(token, cursor)
token_end = token_start + len(token)
if self.do_lower_case and token not in never_split:
token = token.lower()
tokens.append(token)
cursor = token_end
return tokens
class CharacterTokenizer(object):
"""Runs Character tokenziation."""
def __init__(self, vocab, unk_token, normalize_text=True):
"""Constructs a CharacterTokenizer.
Args:
**vocab**:
Vocabulary object.
**unk_token**: str
A special symbol for out-of-vocabulary token.
**normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization.
"""
self.vocab = vocab
self.unk_token = unk_token
self.normalize_text = normalize_text
def tokenize(self, text):
"""Tokenizes a piece of text into characters.
For example:
input = "apple"
output = ["a", "p", "p", "l", "e"]
Args:
text: A single token or whitespace separated tokens.
This should have already been passed through `BasicTokenizer`.
Returns:
A list of characters.
"""
if self.normalize_text:
text = unicodedata.normalize('NFKC', text)
output_tokens = []
for i, char in enumerate(text):
if char not in self.vocab:
output_tokens.append(self.unk_token)
continue
output_tokens.append(char)
return output_tokens
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os
from shutil import copyfile
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'camembert-base': None,
}
class CamembertTokenizer(PreTrainedTokenizer):
"""
Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token, additional_special_tokens=additional_special_tokens,
**kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file
# HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
# sentencepiece vocabulary (this is the case for <s> and </s>
self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
self.fairseq_offset = len(self.fairseq_tokens_to_ids)
self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A RoBERTa sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
if token_ids_1 is None, only returns the first portion of the mask (0's).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
@property
def vocab_size(self):
return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
def _tokenize(self, text):
return self.sp_model.EncodeAsPieces(text)
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
elif self.sp_model.PieceToId(token) == 0:
# Convert sentence piece unk token to fairseq unk token index
return self.unk_token_id
return self.fairseq_offset + self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
......@@ -46,6 +46,64 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ctrl': 256,
}
CONTROL_CODES = {
"Pregnancy": 168629,
"Christianity": 7675,
"Explain": 106423,
"Fitness": 63440,
"Saving": 63163,
"Ask": 27171,
"Ass": 95985,
"Joke": 163509,
"Questions": 45622,
"Thoughts": 49605,
"Retail": 52342,
"Feminism": 164338,
"Writing": 11992,
"Atheism": 192263,
"Netflix": 48616,
"Computing": 39639,
"Opinion": 43213,
"Alone": 44967,
"Funny": 58917,
"Gaming": 40358,
"Human": 4088,
"India": 1331,
"Joker": 77138,
"Diet": 36206,
"Legal": 11859,
"Norman": 4939,
"Tip": 72689,
"Weight": 52343,
"Movies": 46273,
"Running": 23425,
"Science": 2090,
"Horror": 37793,
"Confession": 60572,
"Finance": 12250,
"Politics": 16360,
"Scary": 191985,
"Support": 12654,
"Technologies": 32516,
"Teenage": 66160,
"Event": 32769,
"Learned": 67460,
"Notion": 182770,
"Wikipedia": 37583,
"Books": 6665,
"Extract": 76050,
"Confessions": 102701,
"Conspiracy": 75932,
"Links": 63674,
"Narcissus": 150425,
"Relationship": 54766,
"Relationships": 134796,
"Reviews": 41671,
"News": 4256,
"Translation": 26820,
"multilingual": 128406,
}
def get_pairs(word):
"""Return set of symbol pairs in a word.
......@@ -63,24 +121,23 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer):
"""
CTRL BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
- Byte-Pair-Encoding
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
control_codes = CONTROL_CODES
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......@@ -137,9 +194,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
"""
split_tokens = []
text = text.split(' ')
words = re.findall(r'\S+\n?', text)
for token in text:
for token in words:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
return split_tokens
......
......@@ -33,12 +33,16 @@ PRETRAINED_VOCAB_FILES_MAP = {
{
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'distilbert-base-uncased': 512,
'distilbert-base-uncased-distilled-squad': 512,
'distilbert-base-german-cased': 512,
'distilbert-base-multilingual-cased': 512,
}
......
......@@ -46,6 +46,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
},
'merges_file':
......@@ -53,6 +54,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
},
}
......@@ -61,6 +63,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'gpt2': 1024,
'gpt2-medium': 1024,
'gpt2-large': 1024,
'gpt2-xl': 1024,
'distilgpt2': 1024,
}
......@@ -104,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
- Requires a space to start the input string => the encoding and tokenize methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
......@@ -119,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
with open(merges_file, encoding='utf-8') as merges_handle:
bpe_merges = merges_handle.read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
......@@ -181,7 +186,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
""" Tokenize a string.
Args:
- add_prefix_space (boolean, default False):
Begin the sentence with at least one space toto get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
"""
if add_prefix_space:
text = ' ' + text
......
......@@ -101,9 +101,11 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
self.nlp = BasicTokenizer(do_lower_case=True)
self.fix_text = None
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......
......@@ -47,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
},
'merges_file':
{
......@@ -54,6 +56,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
},
}
......@@ -62,6 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'roberta-large': 512,
'roberta-large-mnli': 512,
'distilroberta-base': 512,
'roberta-base-openai-detector': 512,
'roberta-large-openai-detector': 512,
}
......@@ -114,7 +120,7 @@ class RobertaTokenizer(GPT2Tokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
......
......@@ -21,9 +21,11 @@ import os
import json
import six
import copy
import itertools
import re
from io import open
from .file_utils import cached_path, is_tf_available, is_torch_available
from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
if is_tf_available():
import tensorflow as tf
......@@ -76,6 +78,8 @@ class PreTrainedTokenizer(object):
"pad_token", "cls_token", "mask_token",
"additional_special_tokens"]
padding_side = "right"
@property
def bos_token(self):
""" Beginning of sentence token (string). Log an error if used while not having been set. """
......@@ -189,6 +193,11 @@ class PreTrainedTokenizer(object):
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.pad_token)
@property
def pad_token_type_id(self):
""" Id of the padding token type in the vocabulary."""
return self._pad_token_type_id
@property
def cls_token_id(self):
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
......@@ -212,10 +221,14 @@ class PreTrainedTokenizer(object):
self._pad_token = None
self._cls_token = None
self._mask_token = None
self._pad_token_type_id = 0
self._additional_special_tokens = []
self.max_len = max_len if max_len is not None else int(1e12)
# Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
self.padding_side = kwargs.pop('padding_side', self.padding_side)
# Added tokens
self.added_tokens_encoder = {}
self.added_tokens_decoder = {}
......@@ -242,6 +255,7 @@ class PreTrainedTokenizer(object):
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
......@@ -251,6 +265,9 @@ class PreTrainedTokenizer(object):
force_download: (`optional`) boolean, default False:
Force to (re-)download the vocabulary files and override the cached versions if they exists.
resume_download: (`optional`) boolean, default False:
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
......@@ -266,6 +283,9 @@ class PreTrainedTokenizer(object):
# Download vocabulary from S3 and cache.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Download vocabulary from S3 (user-uploaded) and cache.
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
......@@ -286,6 +306,7 @@ class PreTrainedTokenizer(object):
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
resume_download = kwargs.pop('resume_download', False)
proxies = kwargs.pop('proxies', None)
s3_models = list(cls.max_model_input_sizes.keys())
......@@ -310,12 +331,15 @@ class PreTrainedTokenizer(object):
if os.path.isdir(pretrained_model_name_or_path):
# If a directory is provided we look for the standard filenames
full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
else:
# If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
full_file_name = pretrained_model_name_or_path
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
# If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
full_file_name = pretrained_model_name_or_path
else:
full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
vocab_files[file_id] = full_file_name
# Look for the additional tokens files
......@@ -352,7 +376,7 @@ class PreTrainedTokenizer(object):
if file_path is None:
resolved_vocab_files[file_id] = None
else:
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download)
except EnvironmentError:
if pretrained_model_name_or_path in s3_models:
msg = "Couldn't reach server at '{}' to download vocabulary files."
......@@ -377,7 +401,8 @@ class PreTrainedTokenizer(object):
# Did we saved some inputs and kwargs to reload ?
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
if tokenizer_config_file is not None:
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
init_kwargs = json.load(tokenizer_config_handle)
saved_init_inputs = init_kwargs.pop('init_inputs', ())
if not init_inputs:
init_inputs = saved_init_inputs
......@@ -402,7 +427,8 @@ class PreTrainedTokenizer(object):
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if special_tokens_map_file is not None:
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key not in init_kwargs:
init_kwargs[key] = value
......@@ -416,7 +442,8 @@ class PreTrainedTokenizer(object):
# Add supplementary tokens.
if added_tokens_file is not None:
added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
tokenizer.added_tokens_encoder.update(added_tok_encoder)
tokenizer.added_tokens_decoder.update(added_tok_decoder)
......@@ -512,6 +539,8 @@ class PreTrainedTokenizer(object):
to_add_tokens = []
for token in new_tokens:
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
token = token.lower()
if token != self.unk_token and \
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
token not in to_add_tokens:
......@@ -604,7 +633,23 @@ class PreTrainedTokenizer(object):
vocabularies (BPE/SentencePieces/WordPieces).
Take care of added tokens.
text: The sequence to be encoded.
**kwargs: passed to the child `self.tokenize()` method
"""
def lowercase_text(t):
# convert non-special tokens to lowercase
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
r'(.+?)'
return re.sub(
pattern,
lambda m: m.groups()[0] or m.groups()[1].lower(),
t)
if self.init_kwargs.get('do_lower_case', False):
text = lowercase_text(text)
def split_on_token(tok, text):
result = []
split_text = text.split(tok)
......@@ -624,7 +669,7 @@ class PreTrainedTokenizer(object):
return result
def split_on_tokens(tok_list, text):
if not text:
if not text.strip():
return []
if not tok_list:
return self._tokenize(text, **kwargs)
......@@ -641,9 +686,9 @@ class PreTrainedTokenizer(object):
tokenized_text += [sub_text]
text_list = tokenized_text
return sum((self._tokenize(token, **kwargs) if token not \
return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
in self.added_tokens_encoder and token not in self.all_special_tokens \
else [token] for token in tokenized_text), [])
else [token] for token in tokenized_text)))
added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
tokenized_text = split_on_tokens(added_tokens, text)
......@@ -671,10 +716,6 @@ class PreTrainedTokenizer(object):
ids = []
for token in tokens:
ids.append(self._convert_token_to_id_with_added_voc(token))
if len(ids) > self.max_len:
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(ids), self.max_len))
return ids
def _convert_token_to_id_with_added_voc(self, token):
......@@ -691,10 +732,11 @@ class PreTrainedTokenizer(object):
def encode(self,
text,
text_pair=None,
add_special_tokens=False,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy='longest_first',
pad_to_max_length=False,
return_tensors=None,
**kwargs):
"""
......@@ -721,6 +763,12 @@ class PreTrainedTokenizer(object):
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
**kwargs: passed to the `self.tokenize()` method
......@@ -731,6 +779,7 @@ class PreTrainedTokenizer(object):
add_special_tokens=add_special_tokens,
stride=stride,
truncation_strategy=truncation_strategy,
pad_to_max_length=pad_to_max_length,
return_tensors=return_tensors,
**kwargs)
......@@ -739,11 +788,16 @@ class PreTrainedTokenizer(object):
def encode_plus(self,
text,
text_pair=None,
add_special_tokens=False,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy='longest_first',
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
**kwargs):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
......@@ -768,9 +822,40 @@ class PreTrainedTokenizer(object):
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
**kwargs: passed to the `self.tokenize()` method
Return:
A Dictionary of shape::
{
input_ids: list[int],
token_type_ids: list[int] if return_token_type_ids is True (default)
attention_mask: list[int] if return_attention_mask is True (default)
overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True
}
With the fields:
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``attention_mask``: list of indices specifying which tokens should be attended to by the model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
"""
def get_input_ids(text):
......@@ -789,13 +874,24 @@ class PreTrainedTokenizer(object):
return self.prepare_for_model(first_ids,
pair_ids=second_ids,
max_length=max_length,
pad_to_max_length=pad_to_max_length,
add_special_tokens=add_special_tokens,
stride=stride,
truncation_strategy=truncation_strategy,
return_tensors=return_tensors)
return_tensors=return_tensors,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask)
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
truncation_strategy='longest_first', return_tensors=None):
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
truncation_strategy='longest_first',
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False):
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
It adds special tokens, truncates
......@@ -818,23 +914,36 @@ class PreTrainedTokenizer(object):
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
Return:
A Dictionary of shape::
{
input_ids: list[int],
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
token_type_ids: list[int] if return_token_type_ids is True (default)
overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True
}
With the fields:
``input_ids``: list of tokens to be fed to a model
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
tokens and 1 specifying sequence tokens.
"""
......@@ -843,40 +952,99 @@ class PreTrainedTokenizer(object):
len_pair_ids = len(pair_ids) if pair else 0
encoded_inputs = {}
# Handle max sequence length
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
if max_length and total_len > max_length:
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
num_tokens_to_remove=total_len-max_length,
truncation_strategy=truncation_strategy,
stride=stride)
if return_overflowing_tokens:
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_length
# Handle special_tokens
if add_special_tokens:
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
else:
sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
if return_tensors == 'tf' and is_tf_available():
sequence = tf.constant([sequence])
token_type_ids = tf.constant([token_type_ids])
elif return_tensors == 'pt' and is_torch_available():
sequence = torch.tensor([sequence])
token_type_ids = torch.tensor([token_type_ids])
elif return_tensors is not None:
logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
encoded_inputs["input_ids"] = sequence
if return_token_type_ids:
encoded_inputs["token_type_ids"] = token_type_ids
if max_length and len(encoded_inputs["input_ids"]) > max_length:
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
if return_token_type_ids:
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(ids), self.max_len))
needs_to_be_padded = pad_to_max_length and (
max_length and len(encoded_inputs["input_ids"]) < max_length
or
max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000
)
if pad_to_max_length and max_length is None and self.max_len > 10000:
logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
if needs_to_be_padded:
difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
if self.padding_side == 'right':
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
if return_token_type_ids:
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
elif self.padding_side == 'left':
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
if return_token_type_ids:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
elif return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
# Prepare inputs as tensors if asked
if return_tensors == 'tf' and is_tf_available():
encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
elif return_tensors == 'pt' and is_torch_available():
encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
elif return_tensors is not None:
logger.warning(
"Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
return_tensors))
return encoded_inputs
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
......@@ -920,7 +1088,6 @@ class PreTrainedTokenizer(object):
return (ids, pair_ids, overflowing_tokens)
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
logger.warning("This tokenizer does not make use of special tokens.")
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
......@@ -933,7 +1100,6 @@ class PreTrainedTokenizer(object):
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
"""
logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
if token_ids_1 is None:
return token_ids_0
return token_ids_0 + token_ids_1
......@@ -951,7 +1117,7 @@ class PreTrainedTokenizer(object):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
......@@ -1055,7 +1221,7 @@ class PreTrainedTokenizer(object):
class attributes (cls_token, unk_token...).
"""
all_toks = self.all_special_tokens
all_ids = list(self._convert_token_to_id(t) for t in all_toks)
all_ids = self.convert_tokens_to_ids(all_toks)
return all_ids
@staticmethod
......
......@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
"""Tokenization classes for XLM."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
......@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
additional_special_tokens=additional_special_tokens,
**kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
# cache of sm.MosesPunctNormalizer instance
self.cache_moses_punct_normalizer = dict()
# cache of sm.MosesTokenizer instance
......@@ -564,9 +568,11 @@ class XLMTokenizer(PreTrainedTokenizer):
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......@@ -758,9 +764,9 @@ class XLMTokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
A XLM sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
pair of sequences: <s> A </s> B </s>
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -781,7 +787,7 @@ class XLMTokenizer(PreTrainedTokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......
......@@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
def __init__(self, vocab_file,
do_lower_case=False, remove_space=True, keep_accents=False,
......@@ -74,6 +75,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
self._pad_token_type_id = 3
try:
import sentencepiece as spm
......@@ -141,7 +143,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(
piece[:-1].replace(SPIECE_UNDERLINE, ''))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
......@@ -185,9 +187,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
An XLNet sequence has the following format:
single sequence: X <sep> <cls>
pair of sequences: A <sep> B <sep> <cls>
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -208,7 +210,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -224,7 +226,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
An XLNet sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
| first sequence | second sequence | CLS segment ID
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment