Commit 5988d2cc authored by yuguo960516

bert-large

parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from libai.config import instantiate
logger = logging.getLogger(__name__)
def build_tokenizer(cfg):
"""Initialize tokenizer."""
tokenizer = instantiate(cfg.tokenizer)
if cfg.append_eod and tokenizer.eod_token is None:
if tokenizer.eos_token is not None:
tokenizer.eod_token = tokenizer.eos_token
else:
tokenizer.eod_token = tokenizer.pad_token
return tokenizer
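# A minimal usage sketch, assuming a LazyCall-style config (as used elsewhere in
# libai) and a hypothetical local vocab path:
#
#   from libai.config import LazyCall
#   from libai.tokenizer import BertTokenizer
#
#   cfg.tokenizer = LazyCall(BertTokenizer)(vocab_file="./vocab.txt")
#   cfg.append_eod = False
#   tokenizer = build_tokenizer(cfg)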
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""copy from HuggingFace transformer repo, to tokenize the sentence.
This class only focus on tokenization, converting token to id and their inverse operation.
It does not construct inputs using special symbols."""
import copy
import itertools
import json
import logging
import os
import unicodedata
from io import open
from typing import Dict, List, Optional, Union
import numpy as np
import oneflow as flow
from libai.utils import distributed as dist
from libai.utils.file_io import PathManager
from libai.utils.file_utils import cached_path
logger = logging.getLogger(__name__)
def _is_whitespace(char):
"""Checks whether `char` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `char` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `char` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (
(cp >= 33 and cp <= 47)
or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96)
or (cp >= 123 and cp <= 126)
):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
class PreTrainedTokenizer(object):
"""
Base class for all tokenizers.
    Handles all the shared methods for tokenization and special tokens, the methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
This class also contains the added tokens in a unified way on top of all tokenizers, so we don't
have to handle the specific vocabulary augmentation methods of the various underlying
dictionary structures (BPE, sentencepiece...).
Class attributes (overridden by derived classes):
``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of
each vocabulary file required by the model, and as associated values, the filename for
saving the associated file (string).
    ``pretrained_vocab_files_map``: a python ``dict`` of ``dict``, the high-level keys being
        the ``__init__`` keyword name of each vocabulary file required by the model, the
        low-level keys being the `short-cut-names` (string) of the pretrained models, and
        the associated values being the `url` (string) to the associated pretrained
        vocabulary file.
``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string)
of the pretrained models, and as associated values, the maximum length of the sequence
inputs of this model, or None if the model has no maximum input size.
``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names`
        (string) of the pretrained models, and as associated values, a dictionary of specific
arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained
model when loading the tokenizer with the ``from_pretrained()`` method.
Args:
bos_token (:obj:`str`, `optional`): A special token representing the beginning of a
sentence.
eos_token (:obj:`str`, `optional`): A special token representing the end of a sentence.
unk_token (:obj:`str`, `optional`): A special token representing an out-of-vocabulary token.
sep_token (:obj:`str`, `optional`): A special token separating two different sentences in
the same input (used by BERT for instance).
pad_token (:obj:`str`, `optional`): A special token used to make arrays of tokens the same
            size for batching purposes.
Will then be ignored by attention mechanisms or loss computation.
cls_token (:obj:`str`, `optional`): A special token representing the class of the input
(used by BERT for instance).
mask_token (:obj:`str`, `optional`): A special token representing a masked token (used by
masked-language modeling pretraining objectives, like BERT).
eod_token (:obj:`str`, `optional`): A special token representing the end of a document.
additional_special_tokens (tuple or list of :obj:`str`, `optional`):
A tuple or a list of additional special tokens.
"""
vocab_files_names = {}
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
"eos_token",
"unk_token",
"sep_token",
"pad_token",
"cls_token",
"mask_token",
"eod_token",
"additional_special_tokens",
]
def __init__(self, verbose=True, **kwargs):
self._bos_token = None
self._eos_token = None
self._unk_token = None
self._sep_token = None
self._pad_token = None
self._cls_token = None
self._mask_token = None
self._eod_token = None
self._additional_special_tokens = []
self.verbose = verbose
# Added tokens - We store this for both slow and fast tokenizers
# until the serialization of Fast tokenizers is updated
self.added_tokens_encoder: Dict[str, int] = {}
self.added_tokens_decoder: Dict[int, str] = {}
self.unique_no_split_tokens: List[str] = []
# inputs and kwargs for saving and re-loading
# (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = {}
# We directly set the hidden value to allow initialization with special tokens
# which are not yet in the vocabulary. Necessary for serialization/de-serialization
for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert all(
isinstance(t, str) for t in value
), "One of the tokens is not a string"
setattr(self, key, list(value))
elif isinstance(value, str):
setattr(self, key, value)
else:
raise TypeError(f"special token {key} has to be str but got: {type(value)}")
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
r"""
Instantiate a :class:`~PreTrainedTokenizer` (or a derived class) from a
predefined tokenizer.
Args:
pretrained_model_name_or_path(`str` or `os.PathLike`):
Can be either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache
or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing vocabulary files required by the tokenizer,
for instance saved using the :func:`~PreTrainedTokenizer.save_pretrained`
method, e.g., ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved
vocabulary file if and only if the tokenizer only requires a single vocabulary
file (e.g. Bert, XLNet), e.g., ``./my_model_directory/vocab.txt``.
            cache_dir: (`optional`) string:
                Path to a directory in which the downloaded predefined tokenizer vocabulary
                files should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Whether to force (re-)downloading the vocabulary files, overriding the
                cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint,
e.g., {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
inputs: (`optional`) positional arguments: will be passed to the
Tokenizer ``__init__`` method.
kwargs: (`optional`) keyword arguments: will be passed to the
Tokenizer ``__init__`` method. Can be used to set special tokens
like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``,
``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``.
See parameters in the doc string of :class:`~PreTrainedTokenizer`
for details.
Examples:
.. code-block:: python
            # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's
            # show our examples on a derived class: BertTokenizer
# Download vocabulary from S3 and cache.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# If vocabulary files are in a directory (e.g. tokenizer was
# saved using `save_pretrained('./test/saved_model/')`)
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
# If the tokenizer uses a single vocabulary file, you can point directly to this file
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
# You can link tokens to special vocabulary when instantiating
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
# You should be sure '<unk>' is in the vocabulary when doing that.
            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
assert tokenizer.unk_token == '<unk>'
"""
return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
s3_models = list(cls.max_model_input_sizes.keys())
vocab_files = {}
init_configuration = {}
if pretrained_model_name_or_path in s3_models:
# Get the vocabulary from AWS S3 bucket
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
if (
cls.pretrained_init_configuration
and pretrained_model_name_or_path in cls.pretrained_init_configuration
):
init_configuration = cls.pretrained_init_configuration[
pretrained_model_name_or_path
]
else:
# Get the vocabulary from local files
logger.info(
"Model name '{}' not found in model shortcut name list ({}). "
"Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
pretrained_model_name_or_path,
", ".join(s3_models),
pretrained_model_name_or_path,
)
)
# Look for the tokenizer main vocabulary files
for file_id, file_name in cls.vocab_files_names.items():
if os.path.isdir(pretrained_model_name_or_path):
# If a directory is provided we look for the standard filenames
full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
else:
# If a path to a file is provided we use it (will only work for non-BPE
# tokenizer using a single vocabulary file)
full_file_name = pretrained_model_name_or_path
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
vocab_files[file_id] = full_file_name
# Look for the additional tokens files
additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE,
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
}
# If a path to a file was provided, get the parent directory
saved_directory = pretrained_model_name_or_path
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
saved_directory = os.path.dirname(saved_directory)
for file_id, file_name in additional_files_names.items():
full_file_name = os.path.join(saved_directory, file_name)
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
vocab_files[file_id] = full_file_name
if all(full_file_name is None for full_file_name in vocab_files.values()):
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find tokenizer files"
"at this path or url.".format(
pretrained_model_name_or_path,
", ".join(s3_models),
pretrained_model_name_or_path,
)
)
return None
# Get files from url, cache, or disk depending on the case
try:
resolved_vocab_files = {}
for file_id, file_path in vocab_files.items():
if file_path is None:
resolved_vocab_files[file_id] = None
else:
resolved_vocab_files[file_id] = cached_path(
file_path,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
)
except EnvironmentError as e:
if pretrained_model_name_or_path in s3_models:
logger.error("Couldn't reach server to download vocabulary.")
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
", ".join(s3_models),
pretrained_model_name_or_path,
str(vocab_files.keys()),
)
)
raise e
for file_id, file_path in vocab_files.items():
if file_path == resolved_vocab_files[file_id]:
logger.info("loading file {}".format(file_path))
else:
logger.info(
"loading file {} from cache at {}".format(
file_path, resolved_vocab_files[file_id]
)
)
# Prepare tokenizer initialization kwargs
        # Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
init_kwargs = init_configuration
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key not in init_kwargs:
init_kwargs[key] = value
# Instantiate tokenizer.
tokenizer = cls(*init_inputs, **init_kwargs)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# Add supplementary tokens.
special_tokens = tokenizer.all_special_tokens
if added_tokens_file is not None:
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
# Sort added tokens by index
            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])
for token, index in added_tok_encoder_sorted:
assert index == len(tokenizer), (
f"Non-consecutive added token '{token}' found. "
f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
)
tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))
# Check all our special tokens are registered as "no split" token
# (we don't cut them) and are in the vocab
added_tokens = tokenizer.sanitize_special_tokens()
if added_tokens:
logger.warning(
"Special tokens have been added in the vocabulary,"
"make sure the associated word embedding are fine-tuned or trained."
)
return tokenizer
def save_pretrained(self, save_directory):
"""
Save the tokenizer vocabulary files together with:
- added tokens,
- special-tokens-to-class-attributes-mapping,
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
This won't save modifications other than ``added tokens`` and ``special token mapping``,
you may have applied to the tokenizer after the instantiation (e.g. modifying
tokenizer.do_lower_case after creation).
        This method makes sure the full tokenizer can then be re-loaded using the
:func:`~PreTrainedTokenizer.from_pretrained` class method.
"""
if not PathManager.isdir(save_directory):
logger.error("Saving directory ({}) should be a directory".format(save_directory))
return
PathManager.mkdirs(save_directory)
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
if len(self.init_inputs) > 0:
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
added_vocab = self.get_added_vocab()
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, ensure_ascii=False)
f.write(out_str)
vocab_files = self.save_vocabulary(save_directory)
return vocab_files + (special_tokens_map_file, added_tokens_file)
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
and special token mappings.
Please use :func:`~PreTrainedTokenizer.save_pretrained` to save the
full Tokenizer state if you want to reload it using the
:func:`~PreTrainedTokenizer.from_pretrained` class method.
"""
raise NotImplementedError
@property
def vocab_size(self) -> int:
"""Size of the base vocabulary (without the added tokens)."""
raise NotImplementedError
def padded_vocab_size(self, multiple=1) -> int:
"""Padded the vocabulary with dummy tokens and return the new size."""
vocab_size = len(self)
while vocab_size % multiple != 0:
vocab_size += 1
return vocab_size
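    # For example, with len(self) == 30522 and multiple=128, this returns 30592
    # (the next multiple of 128); with the default multiple=1 the size is
    # returned unchanged.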
def __len__(self):
"""Size of the full vocabulary with the added tokens."""
return self.vocab_size + len(self.added_tokens_encoder)
def get_vocab(self) -> Dict[str, int]:
"""
Returns the vocabulary as a dictionary of token to index.
:obj:`tokenizer.get_vocab()[token]` is equivalent to
:obj:`tokenizer.convert_tokens_to_ids(token)`
when :obj:`token` is in the vocab.
Returns:
:obj:`Dict[str, int]`: The vocabulary.
"""
raise NotImplementedError
def get_added_vocab(self) -> Dict[str, int]:
"""
Returns the added tokens in the vocabulary as a dictionary of token to index.
Returns:
:obj:`Dict[str, int]`: The added tokens.
"""
return self.added_tokens_encoder
def add_tokens(self, new_tokens: Union[str, List[str]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the
vocabulary, they are added to it with indices starting from the length of
the current vocabulary.
.. Note::
When adding new tokens to the vocabulary, you should make sure to also resize
the token embedding matrix of the model so that its embedding matrix matches
the tokenizer.
In order to do that, please use the
:meth:`~PreTrainedModel.resize_token_embeddings` method.
Args:
new_tokens (:obj:`str`, or a list of `str`):
Tokens are only added if they are not already in the vocabulary.
special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Can be used to specify if the token is a special token. This mostly changes
the normalization behavior
(special tokens like CLS or [MASK] are usually not lower-cased for instance).
Returns:
:obj:`int`: Number of tokens added to the vocabulary.
Examples:
.. code-block:: python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new
# vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
"""
if not new_tokens:
return 0
if not isinstance(new_tokens, (list, tuple)):
new_tokens = [new_tokens]
tokens_to_add = []
for token in new_tokens:
if not isinstance(token, str):
raise TypeError(f"Token {token} is not a string but a {type(token)}.")
if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
token = token.lower()
if (
token != self.unk_token
and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
and token not in tokens_to_add
):
tokens_to_add.append(token)
if self.verbose:
logger.info(f"Adding {token} to the vocabulary")
        added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
self.added_tokens_encoder.update(added_tok_encoder)
self.added_tokens_decoder.update(added_tok_decoder)
if special_tokens:
self.unique_no_split_tokens = sorted(
set(self.unique_no_split_tokens).union(set(new_tokens))
)
else:
self.unique_no_split_tokens = sorted(
set(self.unique_no_split_tokens).union(set(tokens_to_add))
)
return len(tokens_to_add)
def sanitize_special_tokens(self) -> int:
"""
Make sure that all the special tokens attributes of the tokenizer
(:obj:`tokenizer.mask_token`, :obj:`tokenizer.cls_token`, etc.)
are in the vocabulary.
Add the missing ones to the vocabulary if needed.
Return:
            :obj:`int`: The number of tokens added to the vocabulary during the operation.
"""
return self.add_tokens(self.all_special_tokens, special_tokens=True)
def add_special_tokens(self, special_tokens_dict: Dict[str, str]) -> int:
"""
Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to
class attributes. If special tokens are NOT in the vocabulary, they are added to it
(indexed starting from the last index of the current vocabulary).
.. Note::
When adding new tokens to the vocabulary, you should make sure to also resize the
token embedding matrix of the model so that its embedding matrix matches the tokenizer.
In order to do that, please use the
:meth:`~PreTrainedModel.resize_token_embeddings` method.
Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:
- Special tokens are carefully handled by the tokenizer (they are never split).
- You can easily refer to special tokens using tokenizer class attributes like
:obj:`tokenizer.cls_token`. This makes it easy to develop model-agnostic training and
fine-tuning scripts.
When possible, special tokens are already registered for provided pretrained models
        (for instance :class:`~BertTokenizer`'s :obj:`cls_token` is already registered
        to be :obj:`'[CLS]'` and XLM's is registered to be :obj:`'</s>'`).
Args:
special_tokens_dict (dictionary `str` to `str`):
Keys should be in the list of predefined special attributes: [``bos_token``,
``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``,
``cls_token``, ``mask_token``,
``additional_special_tokens``].
Tokens are only added if they are not already in the vocabulary (tested by
                checking if the tokenizer assigns the index of the ``unk_token`` to them).
Returns:
:obj:`int`: Number of tokens added to the vocabulary.
Examples:
.. code-block:: python
# Let's see how to add a new classification token to GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
special_tokens_dict = {'cls_token': '<CLS>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary,
# i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
assert tokenizer.cls_token == '<CLS>'
"""
if not special_tokens_dict:
return 0
added_tokens = 0
for key, value in special_tokens_dict.items():
assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"
if self.verbose:
logger.info(f"Assigning {value} to the {key} key of the tokenizer")
setattr(self, key, value)
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)) and all(
isinstance(t, str) for t in value
), f"Tokens {value} for key {key} should all be a string"
added_tokens += self.add_tokens(value, special_tokens=True)
else:
assert isinstance(value, str), f"Token {value} for key {key} should be a string"
added_tokens += self.add_tokens([value], special_tokens=True)
return added_tokens
def tokenize(self, text: str, **kwargs) -> List[str]:
"""
        Converts a string into a sequence of tokens, using the tokenizer.
        Splits into words for word-based vocabularies or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece). Takes care of added tokens.
Args:
text (:obj:`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific ``prepare_for_tokenization``
preprocessing method.
Returns:
:obj:`List[str]`: The list of tokens.
"""
def split_on_token(tok, text):
result = []
split_text = text.split(tok)
for i, sub_text in enumerate(split_text):
sub_text = sub_text.strip()
if i == 0 and not sub_text:
result += [tok]
elif i == len(split_text) - 1:
if sub_text:
result += [sub_text]
else:
pass
else:
if sub_text:
result += [sub_text]
result += [tok]
return result
def split_on_tokens(tok_list, text):
if not text:
return []
if not tok_list:
return self._tokenize(text, **kwargs)
tokenized_text = []
text_list = [text]
for tok in tok_list:
tokenized_text = []
for sub_text in text_list:
if sub_text not in self.unique_no_split_tokens:
tokenized_text += split_on_token(tok, sub_text)
else:
tokenized_text += [sub_text]
text_list = tokenized_text
return list(
itertools.chain.from_iterable(
(
self._tokenize(token)
if token not in self.unique_no_split_tokens
else [token]
for token in tokenized_text
)
)
)
no_split_token = self.unique_no_split_tokens
tokenized_text = split_on_tokens(no_split_token, text)
return tokenized_text
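    # For instance, with a BERT-style subclass, registered special tokens are
    # kept intact while the surrounding text is sub-tokenized (a sketch; the
    # exact sub-tokens depend on the subclass vocabulary):
    #   tokenizer.tokenize("hello [SEP] world")  # -> ["hello", "[SEP]", "world"]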
def _tokenize(self, text, **kwargs):
"""
        Converts a string into a sequence of tokens (strings), using the tokenizer.
        Splits into words for word-based vocabularies or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece).
        Does NOT take care of added tokens.
"""
raise NotImplementedError
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
"""Converts a token string (or a sequence of tokens) in a single integer id
(or a sequence of ids), using the vocabulary.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
if len(tokens) > 0 and isinstance(tokens[0], list):
ids = []
for ts in tokens:
ids_x = []
for token in ts:
ids_x.append(self._convert_token_to_id_with_added_voc(token))
ids.append(ids_x)
return ids
ids = []
for token in tokens:
ids.append(self._convert_token_to_id_with_added_voc(token))
return ids
def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **kwargs):
if return_tensors is None:
return_token_ids = token_ids
elif return_tensors == "of":
if not is_global:
return_token_ids = flow.tensor(token_ids, dtype=flow.long)
elif is_global:
sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
placement = kwargs.get(
"placement", flow.placement("cuda", list(range(dist.get_world_size())))
)
return_token_ids = flow.tensor(
token_ids, sbp=sbp, placement=placement, dtype=flow.long
)
elif return_tensors == "np":
return_token_ids = np.array(token_ids, dtype=np.int64)
return return_token_ids
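    # A quick sketch of the three return modes (token_ids is a plain list of ints):
    #   tokenizer.convert_to_tensors([101, 102])                       # -> [101, 102]
    #   tokenizer.convert_to_tensors([101, 102], return_tensors="np")  # -> np.int64 array
    #   tokenizer.convert_to_tensors([101, 102], return_tensors="of")  # -> local oneflow tensor
    # With is_global=True, the optional sbp/placement kwargs control how the
    # global tensor is laid out (default: broadcast over all cuda ranks).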
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self.added_tokens_encoder:
return self.added_tokens_encoder[token]
return self._convert_token_to_id(token)
def _convert_token_to_id(self, token):
raise NotImplementedError
def encode(self, text, return_tensors=None, is_global=False, **kwargs):
if isinstance(text, str):
tokens = self.tokenize(text)
token_ids = self.convert_tokens_to_ids(tokens)
token_ids = self.build_inputs_with_special_tokens(token_ids)
token_ids = self.convert_to_tensors(
token_ids, return_tensors=return_tensors, is_global=is_global, **kwargs
)
return token_ids
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
tokens = [self.tokenize(t) for t in text]
token_ids_list = self.convert_tokens_to_ids(tokens)
token_ids_list = [
self.build_inputs_with_special_tokens(token_ids) for token_ids in token_ids_list
]
token_ids_list = self.convert_to_tensors(
token_ids_list, return_tensors=return_tensors, is_global=is_global, **kwargs
)
return token_ids_list
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
raise ValueError(
"Input is not valid. Should be a string, a list/tuple of strings or "
"a list/tuple of integers."
)
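    # Usage sketch: a single string yields one id sequence, a list of strings
    # yields a list of id sequences, and a list of ints passes through as-is:
    #   tokenizer.encode("hello world")   # -> [id_1, id_2, ...]
    #   tokenizer.encode(["a", "b"])      # -> [[...], [...]]
    #   tokenizer.encode([101, 102])      # -> [101, 102]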
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
"""
        Converts a single index or a sequence of indices into a token or a sequence of tokens,
using the vocabulary and added tokens.
Args:
ids (:obj:`int` or :obj:`List[int]`):
The token id (or token ids) to convert to tokens.
skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to remove special tokens in the decoding.
Returns:
:obj:`str` or :obj:`List[str]`: The decoded token(s).
"""
if isinstance(ids, int):
if ids in self.added_tokens_decoder:
return self.added_tokens_decoder[ids]
else:
return self._convert_id_to_token(ids)
tokens = []
for index in ids:
if skip_special_tokens and index in self.all_special_ids:
continue
if index in self.added_tokens_decoder:
tokens.append(self.added_tokens_decoder[index])
else:
tokens.append(self._convert_id_to_token(index))
return tokens
def _convert_id_to_token(self, index: int) -> str:
raise NotImplementedError
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
        Converts a sequence of tokens to a single string. The simplest way to do it is
        ``" ".join(tokens)``, but we often want to remove sub-word tokenization artifacts
at the same time.
Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.
Returns:
:obj:`str`: The joined tokens.
"""
return " ".join(tokens)
def decode(
self,
token_ids,
skip_special_tokens=False,
clean_up_tokenization_spaces=True,
spaces_between_special_tokens: bool = True,
):
"""
        Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary
with options to remove special tokens and clean up tokenization spaces.
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
token_ids: list of tokenized input ids. Can be obtained using the `encode` or
`encode_plus` methods.
            skip_special_tokens: if set to True, will remove special tokens in the decoding.
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
"""
# Convert inputs to python lists
if isinstance(token_ids, flow.Tensor):
token_ids = token_ids.tolist()
filtered_tokens = self.convert_ids_to_tokens(
token_ids, skip_special_tokens=skip_special_tokens
)
        # To avoid mixing byte-level and unicode for byte-level BPE
# we need to build string separately for added tokens and byte-level tokens
# cf. https://github.com/huggingface/transformers/issues/1133
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_tokens:
continue
if token in self.added_tokens_encoder:
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
if spaces_between_special_tokens:
text = " ".join(sub_texts)
else:
text = "".join(sub_texts)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
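    # Roughly the inverse of ``encode`` (ids here are hypothetical):
    #   tokenizer.decode([101, 7592, 2088, 102], skip_special_tokens=True)
    #   # -> "hello world"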
@property
def bos_token(self) -> str:
"""
:obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
"""
if self._bos_token is None and self.verbose:
logger.error("Using bos_token, but it is not set yet.")
return None
return str(self._bos_token)
@property
def eos_token(self) -> str:
"""
:obj:`str`: End of sentence token. Log an error if used while not having been set.
"""
if self._eos_token is None and self.verbose:
logger.error("Using eos_token, but it is not set yet.")
return None
return str(self._eos_token)
@property
def unk_token(self) -> str:
"""
:obj:`str`: Unknown token. Log an error if used while not having been set.
"""
if self._unk_token is None and self.verbose:
logger.error("Using unk_token, but it is not set yet.")
return None
return str(self._unk_token)
@property
def sep_token(self) -> str:
"""
:obj:`str`: Separation token, to separate context and query in an input sequence.
Log an error if used while not having been set.
"""
if self._sep_token is None and self.verbose:
logger.error("Using sep_token, but it is not set yet.")
return None
return str(self._sep_token)
@property
def pad_token(self) -> str:
"""
:obj:`str`: Padding token. Log an error if used while not having been set.
"""
if self._pad_token is None and self.verbose:
logger.error("Using pad_token, but it is not set yet.")
return None
return str(self._pad_token)
@property
def cls_token(self) -> str:
"""
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging
self-attention along the full depth of the model.
Log an error if used while not having been set.
"""
if self._cls_token is None and self.verbose:
logger.error("Using cls_token, but it is not set yet.")
return None
return str(self._cls_token)
@property
def mask_token(self) -> str:
"""
:obj:`str`: Mask token, to use when training a model with masked-language modeling.
Log an error if used while not having been set.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@property
def eod_token(self) -> str:
"""
:obj:`str`: End of document token. Log an error if used while not having been set.
"""
if self._eod_token is None and self.verbose:
logger.error("Using eod_token, but it is not set yet.")
return None
return str(self._eod_token)
@property
def start_token(self) -> str:
"""
        :obj:`str`: Start token of a sentence. Common alias for bos_token and cls_token.
"""
if self._bos_token is not None and self._cls_token is not None:
if self._bos_token == self._cls_token:
return str(self._bos_token)
else:
logger.error("Conflict between bos_token and cls_token.")
return None
elif self._bos_token is None and self._cls_token is not None:
return str(self._cls_token)
elif self._bos_token is not None and self._cls_token is None:
return str(self._bos_token)
else:
logger.error("Using start_token, but it is not set yet.")
return None
@property
def end_token(self) -> str:
"""
        :obj:`str`: End token of a sentence. Common alias for eos_token and sep_token.
        Note: eod_token is not considered, because it is often the same as eos_token.
"""
if self._eos_token is not None and self._sep_token is not None:
if self._eos_token == self._sep_token:
return str(self._eos_token)
else:
logger.error("Conflict between eos_token and _sep_token.")
return None
elif self._eos_token is None and self._sep_token is not None:
return str(self._sep_token)
elif self._eos_token is not None and self._sep_token is None:
return str(self._eos_token)
else:
logger.error("Using end_token, but it is not set yet.")
return None
@property
def additional_special_tokens(self) -> List[str]:
"""
:obj:`List[str]`: All the additional special tokens you may want to use.
Log an error if used while not having been set.
"""
if self._additional_special_tokens is None and self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
return None
return [str(tok) for tok in self._additional_special_tokens]
@bos_token.setter
def bos_token(self, value):
self._bos_token = value
@eos_token.setter
def eos_token(self, value):
self._eos_token = value
@unk_token.setter
def unk_token(self, value):
self._unk_token = value
@sep_token.setter
def sep_token(self, value):
self._sep_token = value
@pad_token.setter
def pad_token(self, value):
self._pad_token = value
@cls_token.setter
def cls_token(self, value):
self._cls_token = value
@mask_token.setter
def mask_token(self, value):
self._mask_token = value
@eod_token.setter
def eod_token(self, value):
self._eod_token = value
@additional_special_tokens.setter
def additional_special_tokens(self, value):
self._additional_special_tokens = value
@property
def bos_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._bos_token is None:
return None
return self.convert_tokens_to_ids(self.bos_token)
@property
def eos_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the end of sentence token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._eos_token is None:
return None
return self.convert_tokens_to_ids(self.eos_token)
@property
def unk_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the unknown token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._unk_token is None:
return None
return self.convert_tokens_to_ids(self.unk_token)
@property
def sep_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the separation token in the vocabulary,
to separate context and query in an input sequence.
Returns :obj:`None` if the token has not been set.
"""
if self._sep_token is None:
return None
return self.convert_tokens_to_ids(self.sep_token)
@property
def pad_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the padding token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._pad_token is None:
return None
return self.convert_tokens_to_ids(self.pad_token)
@property
def cls_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the classification token in the vocabulary,
to extract a summary of an input sequence leveraging self-attention
along the full depth of the model.
Returns :obj:`None` if the token has not been set.
"""
if self._cls_token is None:
return None
return self.convert_tokens_to_ids(self.cls_token)
@property
def mask_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a
model with masked-language modeling. Returns :obj:`None` if the token has not been set.
"""
if self._mask_token is None:
return None
return self.convert_tokens_to_ids(self.mask_token)
@property
def eod_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the end of document token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._eod_token is None:
return None
return self.convert_tokens_to_ids(self.eod_token)
@property
def start_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the start token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
start_token = self.start_token
if start_token is None:
return None
else:
return self.convert_tokens_to_ids(start_token)
@property
def end_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the end token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
end_token = self.end_token
if end_token is None:
return None
else:
return self.convert_tokens_to_ids(end_token)
@property
def additional_special_tokens_ids(self) -> List[int]:
"""
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
Log an error if used while not having been set.
"""
return self.convert_tokens_to_ids(self.additional_special_tokens)
@property
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
A dictionary mapping special token class attributes
(:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
attr_value = getattr(self, "_" + attr)
if attr_value:
set_attr[attr] = attr_value
return set_attr
@property
def all_special_tokens(self) -> List[str]:
"""
:obj:`List[str]`: All the special tokens
(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
"""
all_toks = []
set_attr = self.special_tokens_map
for attr_value in set_attr.values():
all_toks = all_toks + (
list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]
)
all_toks = list(set(all_toks))
return all_toks
@property
def all_special_ids(self) -> List[int]:
"""
:obj:`List[int]`: List the ids of the special tokens
(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
"""
all_toks = self.all_special_tokens
all_ids = list(self.convert_tokens_to_ids(all_toks))
return all_ids
@staticmethod
def clean_up_tokenization(out_string):
"""Clean up a list of simple English tokenization artifacts like spaces before
        punctuation and abbreviated forms.
"""
out_string = (
out_string.replace(" .", ".")
.replace(" ?", "?")
.replace(" !", "!")
.replace(" ,", ",")
.replace(" ' ", "'")
.replace(" n't", "n't")
.replace(" 'm", "'m")
.replace(" do not", " don't")
.replace(" 's", "'s")
.replace(" 've", "'ve")
.replace(" 're", "'re")
)
return out_string
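    # For example:
    #   PreTrainedTokenizer.clean_up_tokenization("he 's here , is n't he ?")
    #   # -> "he's here, isn't he?"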
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for bert (wordpieces)."""
import collections
import logging
import os
import re
import unicodedata
from io import open
from typing import List, Optional
from .tokenization_base import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"bert-base-uncased": 512,
"bert-large-uncased": 512,
"bert-base-cased": 512,
"bert-large-cased": 512,
"bert-base-chinese": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"bert-base-uncased": {"do_lower_case": True},
"bert-large-uncased": {"do_lower_case": True},
"bert-base-cased": {"do_lower_case": False},
"bert-large-cased": {"do_lower_case": False},
"bert-base-chinese": {"do_lower_case": False},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
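# For example: whitespace_tokenize("  hello   world \n") -> ["hello", "world"]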
def _is_chinese_substr(substr):
    return re.findall("##[\u4E00-\u9FA5]", substr)
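# For example, _is_chinese_substr("##有") returns a non-empty (truthy) match list,
# while _is_chinese_substr("##ing") returns [] (falsy).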
class BertTokenizer(PreTrainedTokenizer):
"""
Construct a BERT tokenizer. Based on WordPiece.
Args:
vocab_file (:obj:`str`):
Path to a one-wordpiece-per-line vocabulary file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lower case the input.
Only has an effect when do_basic_tokenize=True.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to do basic tokenization before wordpiece.
never_split (:obj:`Iterable`, `optional`):
List of tokens which will never be split during tokenization.
Only has an effect when do_basic_tokenize=True.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese,
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328.
do_chinese_wwm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to do whole word masking for Chinese.
            Chinese sentences will be segmented by a third-party tool first.
            Each substring is prefixed with '##' and its index is calculated as
            id(##A) = id(A) + vocab_size.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
do_chinese_wwm=False,
add_bos_token=False,
**kwargs,
):
super(BertTokenizer, self).__init__(
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the "
"vocabulary from a Google pretrained model use "
"`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
vocab_file
)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()]
)
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
if do_chinese_wwm:
self.basic_tokenizer = BasicTokenizerWithChineseWWM(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
)
else:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.
For Chinese substr, id = vocab_size + id(substr.remove(##)).
"""
index = self.vocab.get(token, self.vocab.get(self.unk_token))
return index
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab.
For Chinese substr, id = vocab_size + id(substr.remove(##)).
"""
token = self.ids_to_tokens.get(index, self.unk_token)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
BERT format sentence input:
- single sequence: [CLS] tokens_a [SEP]
- pair of sequences: [CLS] tokens_a [SEP] tokens_b [SEP]
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
if self.add_bos_token:
cls = [self.cls_token_id]
sep = [self.sep_token_id]
else:
cls = []
sep = []
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
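    # With add_bos_token=True this produces the standard BERT layout, e.g.
    # (using cls_id/sep_id for the [CLS]/[SEP] ids):
    #   build_inputs_with_special_tokens([1, 2])       # -> [cls_id, 1, 2, sep_id]
    #   build_inputs_with_special_tokens([1, 2], [3])  # -> [cls_id, 1, 2, sep_id, 3, sep_id]
    # With add_bos_token=False (the default) the ids pass through unchanged.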
def save_vocabulary(self, save_directory, filename_prefix=None):
"""Save the tokenizer vocabulary to a directory or file."""
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "")
+ VOCAB_FILES_NAMES["vocab_file"],
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
"Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file)
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic
tokenization (punctuation splitting, lower casing, etc.).
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
"""Constructs a BasicTokenizer.
Args:
**do_lower_case**: Whether to lower case the input.
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level
(see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
**tokenize_chinese_chars**: (`optional`) boolean (default True)
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
"""
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text.
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
Args:
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level
(see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
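    # For example, with do_lower_case=True (the default):
    #   BasicTokenizer().tokenize("Hello, WORLD!")  # -> ["hello", ",", "world", "!"]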
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class BasicTokenizerWithChineseWWM(BasicTokenizer):
"""Pre-segmentation for Chinese sentences, which will be used in whole word mask."""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
super(BasicTokenizerWithChineseWWM, self).__init__(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
)
try:
import jieba
self.pre_tokenizer = lambda x: jieba.lcut(x, HMM=False)
except ImportError:
            raise ImportError("Chinese whole word masking requires jieba")
def _tokenize_chinese_chars(self, text):
"""For Chinese pieces, uses jieba to segment the words and
adds whitespace around CJK character."""
output = []
piece = ""
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
piece += char
else:
chinese_words = self.pre_tokenizer(piece)
for word in chinese_words:
output.append(" ")
output.append(word)
output.append(" ")
output.append(char)
piece = ""
chinese_words = self.pre_tokenizer(piece)
for word in chinese_words:
output.append(" ")
output.append(word)
output.append(" ")
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
input = "有没有"
output = ["有", "##没", "##有"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr.startswith("##"):
if _is_chinese_substr(substr):
if substr[2:] in self.vocab: # for Chinese substr
cur_substr = substr
break
else:
if substr in self.vocab: # for English substr
cur_substr = substr
break
else:
if (
substr in self.vocab
): # non-substr, maybe character or whole Chinese word
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
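# A minimal sketch of the greedy longest-match-first behavior, assuming a toy
# vocabulary (the names below are illustrative, not from the source):
#   vocab = {"un", "##aff", "##able", "[UNK]"}
#   WordpieceTokenizer(vocab, unk_token="[UNK]").tokenize("unaffable")
#   -> ["un", "##aff", "##able"]
# A token with no full wordpiece cover (e.g. "xyzzy") collapses to "[UNK]".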
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT (BPE)."""
import json
import logging
import os
from functools import lru_cache
from io import open
from typing import List, Optional
import regex as re
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json"},
"merges_file": {"gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt2": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
    Returns a mapping between utf-8 bytes and unicode strings. We specifically avoid mapping
    to whitespace/control characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode
characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token
dataset you end up needing around 5K for decent coverage. This is a significant percentage
of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8
bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
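# A few entries of the resulting table (these values follow directly from the
# construction above):
#   table = bytes_to_unicode()
#   table[ord("A")] -> "A"   # printable ASCII maps to itself
#   table[ord(" ")] -> "Ġ"   # space (0x20) is shifted to chr(256 + 32)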
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
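# For example (a direct consequence of the loop above):
#   get_pairs(("h", "e", "l", "l", "o"))
#   -> {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}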
class GPT2Tokenizer(PreTrainedTokenizer):
"""
Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be
converted to an ID and is set to be this token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_bos_token=False,
**kwargs,
):
super(GPT2Tokenizer, self).__init__(
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
)
        with open(vocab_file, encoding="utf-8") as f:
            self.encoder = json.load(f)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as f:
            bpe_data = f.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
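    # A hedged walk-through of bpe(), assuming toy merge ranks rather than the
    # real GPT-2 merges table:
    #   with self.bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1},
    #   bpe("low") first merges ("l", "o"), giving ("lo", "w"),
    #   then merges ("lo", "w"), giving ("low",), and returns "low".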
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
# Maps all our bytes to unicode strings, avoiding control tokens
# of the BPE (spaces in our case)
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
GPT2 format sentence input:
- single sequence: <|endoftext|> tokens_a
- pair of sequences: <|endoftext|> tokens_a <|endoftext|> tokens_b
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
if self.add_bos_token:
bos = [self.bos_token_id]
else:
bos = []
if token_ids_1 is None:
return bos + token_ids_0
return bos + token_ids_0 + bos + token_ids_1
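    # Illustrative behavior (the ids are placeholders, not real vocabulary ids):
    #   add_bos_token=False: build_inputs_with_special_tokens([5, 6]) -> [5, 6]
    #   add_bos_token=True:  build_inputs_with_special_tokens([5, 6]) -> [bos_id, 5, 6]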
def save_vocabulary(self, save_directory, filename_prefix=None):
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
merge_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"],
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return (vocab_file, merge_file)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
import json
import logging
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"roberta-base": 512,
"roberta-large": 512,
"roberta-large-mnli": 512,
}
@lru_cache()
def bytes_to_unicode():
"""
    Returns a mapping between utf-8 bytes and unicode strings. We specifically avoid mapping to
    whitespace/control characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode
characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token
dataset you end up needing around 5K for decent coverage. This is a significant percentage of
your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and
unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class RobertaTokenizer(PreTrainedTokenizer):
"""Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer,
using byte-level Byte-Pair-Encoding.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`str`, `optional`, defaults to `<s>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to `</s>`):
The end of sequence token.
cls_token (:obj:`str`, `optional`, defaults to `<s>`):
The first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to `<unk>`):
The unknown token. A token that is not in the vocabulary cannot be
converted to an ID and is set to be this token instead.
pad_token (:obj:`str`, `optional`, defaults to `<pad>`): A special token
used to make arrays of tokens the same size for batching purpose.
Will then be ignored by attention mechanisms or loss computation.
mask_token (:obj:`str`, `optional`, defaults to `<mask>`): A special token
representing a masked token (used by masked-language modeling pretraining
objectives, like BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_bos_token=False,
**kwargs,
):
super(RobertaTokenizer, self).__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
with open(vocab_file, encoding="utf-8") as file:
self.encoder = json.load(file)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as file:
bpe_merges = file.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
RoBERTa format sentence input:
- single sequence: [CLS] tokens_a [SEP]
- pair of sequences: [CLS] tokens_a [SEP] tokens_b [SEP]
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
if self.add_bos_token:
cls = [self.cls_token_id]
sep = [self.sep_token_id]
else:
cls = []
sep = []
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
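    # Illustrative behavior with add_bos_token=True (ids are placeholders):
    #   build_inputs_with_special_tokens([5], [6])
    #   -> [cls_id, 5, sep_id, 6, sep_id]
    # With add_bos_token=False the ids pass through concatenated, unchanged.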
def save_vocabulary(
self, save_directory: str, filename_prefix: Optional[str] = None
) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
merge_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"],
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair
classification task. RoBERTa does not make use of token type ids, therefore
a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
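    # For example (the length follows from the formula above):
    #   create_token_type_ids_from_sequences([5, 6]) -> [0, 0, 0, 0]
    #   (cls + two tokens + sep = 4 positions, all zeros)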
# coding=utf-8
# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for Google T5 (sentence piece)."""
import logging
import os
import warnings
from shutil import copyfile
from typing import List, Optional
import regex as re
import sentencepiece as spm
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model"}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-base": 512,
}
class T5Tokenizer(PreTrainedTokenizer):
"""
    Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot
be converted to an ID and is set to be this token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`int`, `optional`, defaults to 100):
Add a number of extra ids added to the end of the vocabulary for use
as sentinels. These tokens are accessible as "<extra_id_{%d}>" where
"{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed
            from the end of the vocabulary down to the beginning ("<extra_id_0>" is the
            last token in the vocabulary, as in T5 preprocessing; see `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
add_bos_token=False,
**kwargs,
):
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
extra_tokens = len(
set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))
)
if extra_tokens != extra_ids:
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens "
f"({additional_special_tokens}) are privided to T5Tokenizer. "
"In this case the additional_special_tokens must include the extra_ids tokens"
)
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return self.sp_model.get_piece_size() + self._extra_ids
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
pieces = self.sp_model.encode(text, out_type=str)
return pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token.startswith("<extra_id_"):
match = re.match(r"<extra_id_(\d+)>", token)
num = int(match.group(1))
return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)
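    # The sentinel arithmetic above, worked for an assumed vocabulary of
    # 32000 sentencepiece pieces + 100 extra ids (vocab_size = 32100):
    #   "<extra_id_0>"  -> 32100 - 0 - 1  = 32099  (the last id)
    #   "<extra_id_99>" -> 32100 - 99 - 1 = 32000  (the first id past the sp pieces)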
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index < self.sp_model.get_piece_size():
token = self.sp_model.IdToPiece(index)
else:
token = f"<extra_id_{self.vocab_size - 1 - index}>"
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
current_sub_tokens = []
out_string = ""
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " "
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += self.sp_model.decode_pieces(current_sub_tokens)
return out_string.strip()
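    # A hedged sketch (the piece strings are assumed sentencepiece output):
    #   convert_tokens_to_string(["▁Hello", "▁world", "</s>"])
    #   decodes ["▁Hello", "▁world"] with the sp model, appends "</s> ",
    #   and strips, yielding something like "Hello world</s>".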
def _add_eos_if_not_present(self, token_ids):
if not self.add_bos_token:
return token_ids
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(f"This sequence already has {self.eos_token}.")
return token_ids
else:
return token_ids + [self.eos_token_id]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
T5 format sentence input:
- single sequence: tokens_a </s>
- pair of sequences: tokens_a </s> tokens_b </s>
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
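    # Illustrative behavior (ids are placeholders):
    #   add_bos_token=True:  build_inputs_with_special_tokens([5], [6]) -> [5, eos_id, 6, eos_id]
    #   add_bos_token=False: the token ids pass through unchanged.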
def save_vocabulary(self, save_directory, filename_prefix=None):
"""Save the tokenizer vocabulary to a directory or file."""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
logger.info(f"Copy vocab file to {out_vocab_file}")
return (out_vocab_file,)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.