Commit 5988d2cc authored by yuguo960516

bert-large

parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from libai.config import instantiate
logger = logging.getLogger(__name__)
def build_tokenizer(cfg):
"""Initialize tokenizer."""
tokenizer = instantiate(cfg.tokenizer)
if cfg.append_eod and tokenizer.eod_token is None:
if tokenizer.eos_token is not None:
tokenizer.eod_token = tokenizer.eos_token
else:
tokenizer.eod_token = tokenizer.pad_token
return tokenizer
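# A minimal usage sketch, assuming a LazyCall-style config (as used elsewhere in
# libai) and a hypothetical local vocab path:
#
#   from libai.config import LazyCall
#   from libai.tokenizer import BertTokenizer
#
#   cfg.tokenizer = LazyCall(BertTokenizer)(vocab_file="./vocab.txt")
#   cfg.append_eod = False
#   tokenizer = build_tokenizer(cfg)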
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""copy from HuggingFace transformer repo, to tokenize the sentence.
This class only focus on tokenization, converting token to id and their inverse operation.
It does not construct inputs using special symbols."""
import copy
import itertools
import json
import logging
import os
import unicodedata
from io import open
from typing import Dict, List, Optional, Union
import numpy as np
import oneflow as flow
from libai.utils import distributed as dist
from libai.utils.file_io import PathManager
from libai.utils.file_utils import cached_path
logger = logging.getLogger(__name__)
def _is_whitespace(char):
"""Checks whether `char` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `char` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `char` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (
(cp >= 33 and cp <= 47)
or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96)
or (cp >= 123 and cp <= 126)
):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
class PreTrainedTokenizer(object):
"""
Base class for all tokenizers.
    Handles all the shared methods for tokenization and special tokens, the methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
This class also contains the added tokens in a unified way on top of all tokenizers, so we don't
have to handle the specific vocabulary augmentation methods of the various underlying
dictionary structures (BPE, sentencepiece...).
Class attributes (overridden by derived classes):
``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of
each vocabulary file required by the model, and as associated values, the filename for
saving the associated file (string).
    ``pretrained_vocab_files_map``: a python ``dict`` of ``dict``, the high-level keys being
        the ``__init__`` keyword name of each vocabulary file required by the model, the
        low-level keys being the `short-cut-names` (string) of the pretrained models, and
        the associated values being the `url` (string) to the associated pretrained
        vocabulary file.
``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string)
of the pretrained models, and as associated values, the maximum length of the sequence
inputs of this model, or None if the model has no maximum input size.
``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names`
        (string) of the pretrained models, and as associated values, a dictionary of specific
arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained
model when loading the tokenizer with the ``from_pretrained()`` method.
Args:
bos_token (:obj:`str`, `optional`): A special token representing the beginning of a
sentence.
eos_token (:obj:`str`, `optional`): A special token representing the end of a sentence.
unk_token (:obj:`str`, `optional`): A special token representing an out-of-vocabulary token.
sep_token (:obj:`str`, `optional`): A special token separating two different sentences in
the same input (used by BERT for instance).
pad_token (:obj:`str`, `optional`): A special token used to make arrays of tokens the same
            size for batching purposes.
Will then be ignored by attention mechanisms or loss computation.
cls_token (:obj:`str`, `optional`): A special token representing the class of the input
(used by BERT for instance).
mask_token (:obj:`str`, `optional`): A special token representing a masked token (used by
masked-language modeling pretraining objectives, like BERT).
eod_token (:obj:`str`, `optional`): A special token representing the end of a document.
additional_special_tokens (tuple or list of :obj:`str`, `optional`):
A tuple or a list of additional special tokens.
"""
vocab_files_names = {}
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
"eos_token",
"unk_token",
"sep_token",
"pad_token",
"cls_token",
"mask_token",
"eod_token",
"additional_special_tokens",
]
def __init__(self, verbose=True, **kwargs):
self._bos_token = None
self._eos_token = None
self._unk_token = None
self._sep_token = None
self._pad_token = None
self._cls_token = None
self._mask_token = None
self._eod_token = None
self._additional_special_tokens = []
self.verbose = verbose
# Added tokens - We store this for both slow and fast tokenizers
# until the serialization of Fast tokenizers is updated
self.added_tokens_encoder: Dict[str, int] = {}
self.added_tokens_decoder: Dict[int, str] = {}
self.unique_no_split_tokens: List[str] = []
# inputs and kwargs for saving and re-loading
# (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = {}
# We directly set the hidden value to allow initialization with special tokens
# which are not yet in the vocabulary. Necessary for serialization/de-serialization
for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert all(
isinstance(t, str) for t in value
), "One of the tokens is not a string"
setattr(self, key, list(value))
elif isinstance(value, str):
setattr(self, key, value)
else:
raise TypeError(f"special token {key} has to be str but got: {type(value)}")
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
r"""
Instantiate a :class:`~PreTrainedTokenizer` (or a derived class) from a
predefined tokenizer.
Args:
pretrained_model_name_or_path(`str` or `os.PathLike`):
Can be either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache
or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing vocabulary files required by the tokenizer,
for instance saved using the :func:`~PreTrainedTokenizer.save_pretrained`
method, e.g., ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved
vocabulary file if and only if the tokenizer only requires a single vocabulary
file (e.g. Bert, XLNet), e.g., ``./my_model_directory/vocab.txt``.
            cache_dir: (`optional`) string:
                Path to a directory in which the downloaded predefined tokenizer vocabulary
                files should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Whether to force (re-)downloading the vocabulary files, overriding the
                cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint,
e.g., {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
inputs: (`optional`) positional arguments: will be passed to the
Tokenizer ``__init__`` method.
kwargs: (`optional`) keyword arguments: will be passed to the
Tokenizer ``__init__`` method. Can be used to set special tokens
like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``,
``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``.
See parameters in the doc string of :class:`~PreTrainedTokenizer`
for details.
Examples:
.. code-block:: python
            # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's
            # show our examples on a derived class: BertTokenizer
# Download vocabulary from S3 and cache.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# If vocabulary files are in a directory (e.g. tokenizer was
# saved using `save_pretrained('./test/saved_model/')`)
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
# If the tokenizer uses a single vocabulary file, you can point directly to this file
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
# You can link tokens to special vocabulary when instantiating
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
# You should be sure '<unk>' is in the vocabulary when doing that.
            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
assert tokenizer.unk_token == '<unk>'
"""
return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
s3_models = list(cls.max_model_input_sizes.keys())
vocab_files = {}
init_configuration = {}
if pretrained_model_name_or_path in s3_models:
# Get the vocabulary from AWS S3 bucket
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
if (
cls.pretrained_init_configuration
and pretrained_model_name_or_path in cls.pretrained_init_configuration
):
init_configuration = cls.pretrained_init_configuration[
pretrained_model_name_or_path
]
else:
# Get the vocabulary from local files
logger.info(
"Model name '{}' not found in model shortcut name list ({}). "
"Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
pretrained_model_name_or_path,
", ".join(s3_models),
pretrained_model_name_or_path,
)
)
# Look for the tokenizer main vocabulary files
for file_id, file_name in cls.vocab_files_names.items():
if os.path.isdir(pretrained_model_name_or_path):
# If a directory is provided we look for the standard filenames
full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
else:
# If a path to a file is provided we use it (will only work for non-BPE
# tokenizer using a single vocabulary file)
full_file_name = pretrained_model_name_or_path
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
vocab_files[file_id] = full_file_name
# Look for the additional tokens files
additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE,
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
}
# If a path to a file was provided, get the parent directory
saved_directory = pretrained_model_name_or_path
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
saved_directory = os.path.dirname(saved_directory)
for file_id, file_name in additional_files_names.items():
full_file_name = os.path.join(saved_directory, file_name)
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
full_file_name = None
vocab_files[file_id] = full_file_name
if all(full_file_name is None for full_file_name in vocab_files.values()):
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find tokenizer files"
"at this path or url.".format(
pretrained_model_name_or_path,
", ".join(s3_models),
pretrained_model_name_or_path,
)
)
return None
# Get files from url, cache, or disk depending on the case
try:
resolved_vocab_files = {}
for file_id, file_path in vocab_files.items():
if file_path is None:
resolved_vocab_files[file_id] = None
else:
resolved_vocab_files[file_id] = cached_path(
file_path,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
)
except EnvironmentError as e:
if pretrained_model_name_or_path in s3_models:
logger.error("Couldn't reach server to download vocabulary.")
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
", ".join(s3_models),
pretrained_model_name_or_path,
str(vocab_files.keys()),
)
)
raise e
for file_id, file_path in vocab_files.items():
if file_path == resolved_vocab_files[file_id]:
logger.info("loading file {}".format(file_path))
else:
logger.info(
"loading file {} from cache at {}".format(
file_path, resolved_vocab_files[file_id]
)
)
# Prepare tokenizer initialization kwargs
        # Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
init_kwargs = init_configuration
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key not in init_kwargs:
init_kwargs[key] = value
# Instantiate tokenizer.
tokenizer = cls(*init_inputs, **init_kwargs)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# Add supplementary tokens.
special_tokens = tokenizer.all_special_tokens
if added_tokens_file is not None:
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
# Sort added tokens by index
            added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])
for token, index in added_tok_encoder_sorted:
assert index == len(tokenizer), (
f"Non-consecutive added token '{token}' found. "
f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
)
tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))
# Check all our special tokens are registered as "no split" token
# (we don't cut them) and are in the vocab
added_tokens = tokenizer.sanitize_special_tokens()
if added_tokens:
logger.warning(
"Special tokens have been added in the vocabulary,"
"make sure the associated word embedding are fine-tuned or trained."
)
return tokenizer
def save_pretrained(self, save_directory):
"""
Save the tokenizer vocabulary files together with:
- added tokens,
- special-tokens-to-class-attributes-mapping,
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
This won't save modifications other than ``added tokens`` and ``special token mapping``,
you may have applied to the tokenizer after the instantiation (e.g. modifying
tokenizer.do_lower_case after creation).
        This method makes sure the full tokenizer can then be re-loaded using the
:func:`~PreTrainedTokenizer.from_pretrained` class method.
"""
if not PathManager.isdir(save_directory):
logger.error("Saving directory ({}) should be a directory".format(save_directory))
return
PathManager.mkdirs(save_directory)
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
if len(self.init_inputs) > 0:
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
added_vocab = self.get_added_vocab()
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, ensure_ascii=False)
f.write(out_str)
vocab_files = self.save_vocabulary(save_directory)
return vocab_files + (special_tokens_map_file, added_tokens_file)
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
and special token mappings.
Please use :func:`~PreTrainedTokenizer.save_pretrained` to save the
full Tokenizer state if you want to reload it using the
:func:`~PreTrainedTokenizer.from_pretrained` class method.
"""
raise NotImplementedError
@property
def vocab_size(self) -> int:
"""Size of the base vocabulary (without the added tokens)."""
raise NotImplementedError
def padded_vocab_size(self, multiple=1) -> int:
"""Padded the vocabulary with dummy tokens and return the new size."""
vocab_size = len(self)
while vocab_size % multiple != 0:
vocab_size += 1
return vocab_size
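    # For example, with len(self) == 30522 and multiple=128, this returns 30592
    # (the next multiple of 128); with the default multiple=1 the size is
    # returned unchanged.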
def __len__(self):
"""Size of the full vocabulary with the added tokens."""
return self.vocab_size + len(self.added_tokens_encoder)
def get_vocab(self) -> Dict[str, int]:
"""
Returns the vocabulary as a dictionary of token to index.
:obj:`tokenizer.get_vocab()[token]` is equivalent to
:obj:`tokenizer.convert_tokens_to_ids(token)`
when :obj:`token` is in the vocab.
Returns:
:obj:`Dict[str, int]`: The vocabulary.
"""
raise NotImplementedError
def get_added_vocab(self) -> Dict[str, int]:
"""
Returns the added tokens in the vocabulary as a dictionary of token to index.
Returns:
:obj:`Dict[str, int]`: The added tokens.
"""
return self.added_tokens_encoder
def add_tokens(self, new_tokens: Union[str, List[str]], special_tokens: bool = False) -> int:
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the
vocabulary, they are added to it with indices starting from the length of
the current vocabulary.
.. Note::
When adding new tokens to the vocabulary, you should make sure to also resize
the token embedding matrix of the model so that its embedding matrix matches
the tokenizer.
In order to do that, please use the
:meth:`~PreTrainedModel.resize_token_embeddings` method.
Args:
new_tokens (:obj:`str`, or a list of `str`):
Tokens are only added if they are not already in the vocabulary.
special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Can be used to specify if the token is a special token. This mostly changes
the normalization behavior
(special tokens like CLS or [MASK] are usually not lower-cased for instance).
Returns:
:obj:`int`: Number of tokens added to the vocabulary.
Examples:
.. code-block:: python
# Let's see how to increase the vocabulary of Bert model and tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new
# vocabulary, i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
"""
if not new_tokens:
return 0
if not isinstance(new_tokens, (list, tuple)):
new_tokens = [new_tokens]
tokens_to_add = []
for token in new_tokens:
if not isinstance(token, str):
raise TypeError(f"Token {token} is not a string but a {type(token)}.")
if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
token = token.lower()
if (
token != self.unk_token
and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
and token not in tokens_to_add
):
tokens_to_add.append(token)
if self.verbose:
logger.info(f"Adding {token} to the vocabulary")
        added_tok_encoder = {tok: len(self) + i for i, tok in enumerate(tokens_to_add)}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
self.added_tokens_encoder.update(added_tok_encoder)
self.added_tokens_decoder.update(added_tok_decoder)
if special_tokens:
self.unique_no_split_tokens = sorted(
set(self.unique_no_split_tokens).union(set(new_tokens))
)
else:
self.unique_no_split_tokens = sorted(
set(self.unique_no_split_tokens).union(set(tokens_to_add))
)
return len(tokens_to_add)
def sanitize_special_tokens(self) -> int:
"""
Make sure that all the special tokens attributes of the tokenizer
(:obj:`tokenizer.mask_token`, :obj:`tokenizer.cls_token`, etc.)
are in the vocabulary.
Add the missing ones to the vocabulary if needed.
Return:
            :obj:`int`: The number of tokens added to the vocabulary during the operation.
"""
return self.add_tokens(self.all_special_tokens, special_tokens=True)
def add_special_tokens(self, special_tokens_dict: Dict[str, str]) -> int:
"""
Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to
class attributes. If special tokens are NOT in the vocabulary, they are added to it
(indexed starting from the last index of the current vocabulary).
.. Note::
When adding new tokens to the vocabulary, you should make sure to also resize the
token embedding matrix of the model so that its embedding matrix matches the tokenizer.
In order to do that, please use the
:meth:`~PreTrainedModel.resize_token_embeddings` method.
Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:
- Special tokens are carefully handled by the tokenizer (they are never split).
- You can easily refer to special tokens using tokenizer class attributes like
:obj:`tokenizer.cls_token`. This makes it easy to develop model-agnostic training and
fine-tuning scripts.
When possible, special tokens are already registered for provided pretrained models
        (for instance :class:`~BertTokenizer`'s :obj:`cls_token` is already registered
        to be :obj:`'[CLS]'` and XLM's is registered to be :obj:`'</s>'`).
Args:
special_tokens_dict (dictionary `str` to `str`):
Keys should be in the list of predefined special attributes: [``bos_token``,
``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``,
``cls_token``, ``mask_token``,
``additional_special_tokens``].
Tokens are only added if they are not already in the vocabulary (tested by
                checking if the tokenizer assigns the index of the ``unk_token`` to them).
Returns:
:obj:`int`: Number of tokens added to the vocabulary.
Examples:
.. code-block:: python
# Let's see how to add a new classification token to GPT-2
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
special_tokens_dict = {'cls_token': '<CLS>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary,
# i.e., the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))
assert tokenizer.cls_token == '<CLS>'
"""
if not special_tokens_dict:
return 0
added_tokens = 0
for key, value in special_tokens_dict.items():
assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"
if self.verbose:
logger.info(f"Assigning {value} to the {key} key of the tokenizer")
setattr(self, key, value)
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)) and all(
isinstance(t, str) for t in value
), f"Tokens {value} for key {key} should all be a string"
added_tokens += self.add_tokens(value, special_tokens=True)
else:
assert isinstance(value, str), f"Token {value} for key {key} should be a string"
added_tokens += self.add_tokens([value], special_tokens=True)
return added_tokens
def tokenize(self, text: str, **kwargs) -> List[str]:
"""
        Converts a string into a sequence of tokens, using the tokenizer.
        Splits into words for word-based vocabularies or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece). Takes care of added tokens.
Args:
text (:obj:`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific ``prepare_for_tokenization``
preprocessing method.
Returns:
:obj:`List[str]`: The list of tokens.
"""
def split_on_token(tok, text):
result = []
split_text = text.split(tok)
for i, sub_text in enumerate(split_text):
sub_text = sub_text.strip()
if i == 0 and not sub_text:
result += [tok]
elif i == len(split_text) - 1:
if sub_text:
result += [sub_text]
else:
pass
else:
if sub_text:
result += [sub_text]
result += [tok]
return result
def split_on_tokens(tok_list, text):
if not text:
return []
if not tok_list:
return self._tokenize(text, **kwargs)
tokenized_text = []
text_list = [text]
for tok in tok_list:
tokenized_text = []
for sub_text in text_list:
if sub_text not in self.unique_no_split_tokens:
tokenized_text += split_on_token(tok, sub_text)
else:
tokenized_text += [sub_text]
text_list = tokenized_text
return list(
itertools.chain.from_iterable(
(
self._tokenize(token)
if token not in self.unique_no_split_tokens
else [token]
for token in tokenized_text
)
)
)
no_split_token = self.unique_no_split_tokens
tokenized_text = split_on_tokens(no_split_token, text)
return tokenized_text
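    # For instance, with a BERT-style subclass, registered special tokens are
    # kept intact while the surrounding text is sub-tokenized (a sketch; the
    # exact sub-tokens depend on the subclass vocabulary):
    #   tokenizer.tokenize("hello [SEP] world")  # -> ["hello", "[SEP]", "world"]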
def _tokenize(self, text, **kwargs):
"""
        Converts a string into a sequence of tokens (strings), using the tokenizer.
        Splits into words for word-based vocabularies or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece).
        Does NOT take care of added tokens.
"""
raise NotImplementedError
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
"""Converts a token string (or a sequence of tokens) in a single integer id
(or a sequence of ids), using the vocabulary.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
if len(tokens) > 0 and isinstance(tokens[0], list):
ids = []
for ts in tokens:
ids_x = []
for token in ts:
ids_x.append(self._convert_token_to_id_with_added_voc(token))
ids.append(ids_x)
return ids
ids = []
for token in tokens:
ids.append(self._convert_token_to_id_with_added_voc(token))
return ids
def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **kwargs):
if return_tensors is None:
return_token_ids = token_ids
elif return_tensors == "of":
if not is_global:
return_token_ids = flow.tensor(token_ids, dtype=flow.long)
elif is_global:
sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
placement = kwargs.get(
"placement", flow.placement("cuda", list(range(dist.get_world_size())))
)
return_token_ids = flow.tensor(
token_ids, sbp=sbp, placement=placement, dtype=flow.long
)
elif return_tensors == "np":
return_token_ids = np.array(token_ids, dtype=np.int64)
return return_token_ids
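    # A quick sketch of the three return modes (token_ids is a plain list of ints):
    #   tokenizer.convert_to_tensors([101, 102])                       # -> [101, 102]
    #   tokenizer.convert_to_tensors([101, 102], return_tensors="np")  # -> np.int64 array
    #   tokenizer.convert_to_tensors([101, 102], return_tensors="of")  # -> local oneflow tensor
    # With is_global=True, the optional sbp/placement kwargs control how the
    # global tensor is laid out (default: broadcast over all cuda ranks).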
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self.added_tokens_encoder:
return self.added_tokens_encoder[token]
return self._convert_token_to_id(token)
def _convert_token_to_id(self, token):
raise NotImplementedError
def encode(self, text, return_tensors=None, is_global=False, **kwargs):
if isinstance(text, str):
tokens = self.tokenize(text)
token_ids = self.convert_tokens_to_ids(tokens)
token_ids = self.build_inputs_with_special_tokens(token_ids)
token_ids = self.convert_to_tensors(
token_ids, return_tensors=return_tensors, is_global=is_global, **kwargs
)
return token_ids
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
tokens = [self.tokenize(t) for t in text]
token_ids_list = self.convert_tokens_to_ids(tokens)
token_ids_list = [
self.build_inputs_with_special_tokens(token_ids) for token_ids in token_ids_list
]
token_ids_list = self.convert_to_tensors(
token_ids_list, return_tensors=return_tensors, is_global=is_global, **kwargs
)
return token_ids_list
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
raise ValueError(
"Input is not valid. Should be a string, a list/tuple of strings or "
"a list/tuple of integers."
)
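    # Usage sketch: a single string yields one id sequence, a list of strings
    # yields a list of id sequences, and a list of ints passes through as-is:
    #   tokenizer.encode("hello world")   # -> [id_1, id_2, ...]
    #   tokenizer.encode(["a", "b"])      # -> [[...], [...]]
    #   tokenizer.encode([101, 102])      # -> [101, 102]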
def convert_ids_to_tokens(
self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]:
"""
        Converts a single index or a sequence of indices into a token or a sequence of tokens,
using the vocabulary and added tokens.
Args:
ids (:obj:`int` or :obj:`List[int]`):
The token id (or token ids) to convert to tokens.
skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to remove special tokens in the decoding.
Returns:
:obj:`str` or :obj:`List[str]`: The decoded token(s).
"""
if isinstance(ids, int):
if ids in self.added_tokens_decoder:
return self.added_tokens_decoder[ids]
else:
return self._convert_id_to_token(ids)
tokens = []
for index in ids:
if skip_special_tokens and index in self.all_special_ids:
continue
if index in self.added_tokens_decoder:
tokens.append(self.added_tokens_decoder[index])
else:
tokens.append(self._convert_id_to_token(index))
return tokens
def _convert_id_to_token(self, index: int) -> str:
raise NotImplementedError
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""
        Converts a sequence of tokens to a single string. The simplest way to do it is
        ``" ".join(tokens)``, but we often want to remove sub-word tokenization artifacts
at the same time.
Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.
Returns:
:obj:`str`: The joined tokens.
"""
return " ".join(tokens)
def decode(
self,
token_ids,
skip_special_tokens=False,
clean_up_tokenization_spaces=True,
spaces_between_special_tokens: bool = True,
):
"""
        Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary
with options to remove special tokens and clean up tokenization spaces.
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args:
token_ids: list of tokenized input ids. Can be obtained using the `encode` or
`encode_plus` methods.
            skip_special_tokens: if set to True, will remove special tokens in the decoding.
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
"""
# Convert inputs to python lists
if isinstance(token_ids, flow.Tensor):
token_ids = token_ids.tolist()
filtered_tokens = self.convert_ids_to_tokens(
token_ids, skip_special_tokens=skip_special_tokens
)
        # To avoid mixing byte-level and unicode for byte-level BPE
# we need to build string separately for added tokens and byte-level tokens
# cf. https://github.com/huggingface/transformers/issues/1133
sub_texts = []
current_sub_text = []
for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_tokens:
continue
if token in self.added_tokens_encoder:
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
current_sub_text = []
sub_texts.append(token)
else:
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
if spaces_between_special_tokens:
text = " ".join(sub_texts)
else:
text = "".join(sub_texts)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
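    # Roughly the inverse of ``encode`` (ids here are hypothetical):
    #   tokenizer.decode([101, 7592, 2088, 102], skip_special_tokens=True)
    #   # -> "hello world"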
@property
def bos_token(self) -> str:
"""
:obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
"""
if self._bos_token is None and self.verbose:
logger.error("Using bos_token, but it is not set yet.")
return None
return str(self._bos_token)
@property
def eos_token(self) -> str:
"""
:obj:`str`: End of sentence token. Log an error if used while not having been set.
"""
if self._eos_token is None and self.verbose:
logger.error("Using eos_token, but it is not set yet.")
return None
return str(self._eos_token)
@property
def unk_token(self) -> str:
"""
:obj:`str`: Unknown token. Log an error if used while not having been set.
"""
if self._unk_token is None and self.verbose:
logger.error("Using unk_token, but it is not set yet.")
return None
return str(self._unk_token)
@property
def sep_token(self) -> str:
"""
:obj:`str`: Separation token, to separate context and query in an input sequence.
Log an error if used while not having been set.
"""
if self._sep_token is None and self.verbose:
logger.error("Using sep_token, but it is not set yet.")
return None
return str(self._sep_token)
@property
def pad_token(self) -> str:
"""
:obj:`str`: Padding token. Log an error if used while not having been set.
"""
if self._pad_token is None and self.verbose:
logger.error("Using pad_token, but it is not set yet.")
return None
return str(self._pad_token)
@property
def cls_token(self) -> str:
"""
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging
self-attention along the full depth of the model.
Log an error if used while not having been set.
"""
if self._cls_token is None and self.verbose:
logger.error("Using cls_token, but it is not set yet.")
return None
return str(self._cls_token)
@property
def mask_token(self) -> str:
"""
:obj:`str`: Mask token, to use when training a model with masked-language modeling.
Log an error if used while not having been set.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
@property
def eod_token(self) -> str:
"""
:obj:`str`: End of document token. Log an error if used while not having been set.
"""
if self._eod_token is None and self.verbose:
logger.error("Using eod_token, but it is not set yet.")
return None
return str(self._eod_token)
@property
def start_token(self) -> str:
"""
        :obj:`str`: Start token of a sentence. Common alias for bos_token and cls_token.
"""
if self._bos_token is not None and self._cls_token is not None:
if self._bos_token == self._cls_token:
return str(self._bos_token)
else:
logger.error("Conflict between bos_token and cls_token.")
return None
elif self._bos_token is None and self._cls_token is not None:
return str(self._cls_token)
elif self._bos_token is not None and self._cls_token is None:
return str(self._bos_token)
else:
logger.error("Using start_token, but it is not set yet.")
return None
@property
def end_token(self) -> str:
"""
        :obj:`str`: End token of a sentence. Common alias for eos_token and sep_token.
        Note: eod_token is not considered, because it is often the same as eos_token.
"""
if self._eos_token is not None and self._sep_token is not None:
if self._eos_token == self._sep_token:
return str(self._eos_token)
else:
logger.error("Conflict between eos_token and _sep_token.")
return None
elif self._eos_token is None and self._sep_token is not None:
return str(self._sep_token)
elif self._eos_token is not None and self._sep_token is None:
return str(self._eos_token)
else:
logger.error("Using end_token, but it is not set yet.")
return None
@property
def additional_special_tokens(self) -> List[str]:
"""
:obj:`List[str]`: All the additional special tokens you may want to use.
Log an error if used while not having been set.
"""
if self._additional_special_tokens is None and self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
return None
return [str(tok) for tok in self._additional_special_tokens]
@bos_token.setter
def bos_token(self, value):
self._bos_token = value
@eos_token.setter
def eos_token(self, value):
self._eos_token = value
@unk_token.setter
def unk_token(self, value):
self._unk_token = value
@sep_token.setter
def sep_token(self, value):
self._sep_token = value
@pad_token.setter
def pad_token(self, value):
self._pad_token = value
@cls_token.setter
def cls_token(self, value):
self._cls_token = value
@mask_token.setter
def mask_token(self, value):
self._mask_token = value
@eod_token.setter
def eod_token(self, value):
self._eod_token = value
@additional_special_tokens.setter
def additional_special_tokens(self, value):
self._additional_special_tokens = value
@property
def bos_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._bos_token is None:
return None
return self.convert_tokens_to_ids(self.bos_token)
@property
def eos_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the end of sentence token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._eos_token is None:
return None
return self.convert_tokens_to_ids(self.eos_token)
@property
def unk_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the unknown token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._unk_token is None:
return None
return self.convert_tokens_to_ids(self.unk_token)
@property
def sep_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the separation token in the vocabulary,
to separate context and query in an input sequence.
Returns :obj:`None` if the token has not been set.
"""
if self._sep_token is None:
return None
return self.convert_tokens_to_ids(self.sep_token)
@property
def pad_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the padding token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._pad_token is None:
return None
return self.convert_tokens_to_ids(self.pad_token)
@property
def cls_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the classification token in the vocabulary,
to extract a summary of an input sequence leveraging self-attention
along the full depth of the model.
Returns :obj:`None` if the token has not been set.
"""
if self._cls_token is None:
return None
return self.convert_tokens_to_ids(self.cls_token)
@property
def mask_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a
model with masked-language modeling. Returns :obj:`None` if the token has not been set.
"""
if self._mask_token is None:
return None
return self.convert_tokens_to_ids(self.mask_token)
@property
def eod_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the end of document token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
if self._eod_token is None:
return None
return self.convert_tokens_to_ids(self.eod_token)
@property
def start_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the start token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
start_token = self.start_token
if start_token is None:
return None
else:
return self.convert_tokens_to_ids(start_token)
@property
def end_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the end token in the vocabulary.
Returns :obj:`None` if the token has not been set.
"""
end_token = self.end_token
if end_token is None:
return None
else:
return self.convert_tokens_to_ids(end_token)
@property
def additional_special_tokens_ids(self) -> List[int]:
"""
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary.
Log an error if used while not having been set.
"""
return self.convert_tokens_to_ids(self.additional_special_tokens)
@property
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
"""
A dictionary mapping special token class attributes
(:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
"""
set_attr = {}
for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
attr_value = getattr(self, "_" + attr)
if attr_value:
set_attr[attr] = attr_value
return set_attr
@property
def all_special_tokens(self) -> List[str]:
"""
:obj:`List[str]`: All the special tokens
(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
"""
all_toks = []
set_attr = self.special_tokens_map
for attr_value in set_attr.values():
all_toks = all_toks + (
list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]
)
all_toks = list(set(all_toks))
return all_toks
@property
def all_special_ids(self) -> List[int]:
"""
:obj:`List[int]`: List the ids of the special tokens
(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
"""
all_toks = self.all_special_tokens
all_ids = list(self.convert_tokens_to_ids(all_toks))
return all_ids
@staticmethod
def clean_up_tokenization(out_string):
"""Clean up a list of simple English tokenization artifacts like spaces before
        punctuation and abbreviated forms.
"""
out_string = (
out_string.replace(" .", ".")
.replace(" ?", "?")
.replace(" !", "!")
.replace(" ,", ",")
.replace(" ' ", "'")
.replace(" n't", "n't")
.replace(" 'm", "'m")
.replace(" do not", " don't")
.replace(" 's", "'s")
.replace(" 've", "'ve")
.replace(" 're", "'re")
)
return out_string
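    # For example:
    #   PreTrainedTokenizer.clean_up_tokenization("he 's here , is n't he ?")
    #   # -> "he's here, isn't he?"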
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for bert (wordpieces)."""
import collections
import logging
import os
import re
import unicodedata
from io import open
from typing import List, Optional
from .tokenization_base import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"bert-base-uncased": 512,
"bert-large-uncased": 512,
"bert-base-cased": 512,
"bert-large-cased": 512,
"bert-base-chinese": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"bert-base-uncased": {"do_lower_case": True},
"bert-large-uncased": {"do_lower_case": True},
"bert-base-cased": {"do_lower_case": False},
"bert-large-cased": {"do_lower_case": False},
"bert-base-chinese": {"do_lower_case": False},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
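# For example: whitespace_tokenize("  hello   world \n") -> ["hello", "world"]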
def _is_chinese_substr(substr):
    return re.findall("##[\u4E00-\u9FA5]", substr)
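# For example, _is_chinese_substr("##有") returns a non-empty (truthy) match list,
# while _is_chinese_substr("##ing") returns [] (falsy).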
class BertTokenizer(PreTrainedTokenizer):
"""
Construct a BERT tokenizer. Based on WordPiece.
Args:
vocab_file (:obj:`str`):
Path to a one-wordpiece-per-line vocabulary file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lower case the input.
Only has an effect when do_basic_tokenize=True.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to do basic tokenization before wordpiece.
never_split (:obj:`Iterable`, `optional`):
List of tokens which will never be split during tokenization.
Only has an effect when do_basic_tokenize=True.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese,
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328.
do_chinese_wwm (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to do whole word masking for Chinese.
            Chinese sentences will be segmented by a third-party tool first.
            Each substring is prefixed with '##' and its index is calculated as
            id(##A) = id(A) + vocab_size.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
do_chinese_wwm=False,
add_bos_token=False,
**kwargs,
):
super(BertTokenizer, self).__init__(
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs,
)
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the "
"vocabulary from a Google pretrained model use "
"`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
vocab_file
)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()]
)
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
if do_chinese_wwm:
self.basic_tokenizer = BasicTokenizerWithChineseWWM(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
)
else:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.
For Chinese substr, id = vocab_size + id(substr.remove(##)).
"""
index = self.vocab.get(token, self.vocab.get(self.unk_token))
return index
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab.
For Chinese substr, id = vocab_size + id(substr.remove(##)).
"""
token = self.ids_to_tokens.get(index, self.unk_token)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
BERT format sentence input:
- single sequence: [CLS] tokens_a [SEP]
- pair of sequences: [CLS] tokens_a [SEP] tokens_b [SEP]
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
if self.add_bos_token:
cls = [self.cls_token_id]
sep = [self.sep_token_id]
else:
cls = []
sep = []
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
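    # With add_bos_token=True this produces the standard BERT layout, e.g.
    # (using cls_id/sep_id for the [CLS]/[SEP] ids):
    #   build_inputs_with_special_tokens([1, 2])       # -> [cls_id, 1, 2, sep_id]
    #   build_inputs_with_special_tokens([1, 2], [3])  # -> [cls_id, 1, 2, sep_id, 3, sep_id]
    # With add_bos_token=False (the default) the ids pass through unchanged.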
def save_vocabulary(self, save_directory, filename_prefix=None):
"""Save the tokenizer vocabulary to a directory or file."""
index = 0
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "")
+ VOCAB_FILES_NAMES["vocab_file"],
)
else:
vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
"Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file)
)
index = token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that will run basic
tokenization (punctuation splitting, lower casing, etc.).
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
"""Constructs a BasicTokenizer.
Args:
**do_lower_case**: Whether to lower case the input.
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level
(see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
**tokenize_chinese_chars**: (`optional`) boolean (default True)
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
"""
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text.
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
Args:
**never_split**: (`optional`) list of str
Kept for backward compatibility purposes.
Now implemented directly at the base class level
(see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
"""
# union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
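    # For example, with do_lower_case=True (the default):
    #   BasicTokenizer().tokenize("Hello, WORLD!")  # -> ["hello", ",", "world", "!"]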
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class BasicTokenizerWithChineseWWM(BasicTokenizer):
"""Pre-segmentation for Chinese sentences, which will be used in whole word mask."""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
super(BasicTokenizerWithChineseWWM, self).__init__(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars,
)
try:
import jieba
self.pre_tokenizer = lambda x: jieba.lcut(x, HMM=False)
except ImportError:
            raise ImportError("Chinese whole word masking requires jieba")
def _tokenize_chinese_chars(self, text):
"""For Chinese pieces, uses jieba to segment the words and
adds whitespace around CJK character."""
output = []
piece = ""
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
piece += char
else:
chinese_words = self.pre_tokenizer(piece)
for word in chinese_words:
output.append(" ")
output.append(word)
output.append(" ")
output.append(char)
piece = ""
chinese_words = self.pre_tokenizer(piece)
for word in chinese_words:
output.append(" ")
output.append(word)
output.append(" ")
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
input = "有没有"
output = ["有", "##没", "##有"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr.startswith("##"):
if _is_chinese_substr(substr):
if substr[2:] in self.vocab: # for Chinese substr
cur_substr = substr
break
else:
if substr in self.vocab: # for English substr
cur_substr = substr
break
else:
if (
substr in self.vocab
): # non-substr, maybe character or whole Chinese word
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
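# A minimal sketch of the greedy longest-match-first behavior, assuming a toy
# vocabulary (the names below are illustrative, not from the source):
#   vocab = {"un", "##aff", "##able", "[UNK]"}
#   WordpieceTokenizer(vocab, unk_token="[UNK]").tokenize("unaffable")
#   -> ["un", "##aff", "##able"]
# A token with no full wordpiece cover (e.g. "xyzzy") collapses to "[UNK]".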
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT (BPE)."""
import json
import logging
import os
from functools import lru_cache
from io import open
from typing import List, Optional
import regex as re
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json"},
"merges_file": {"gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt2": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
    Returns a mapping between utf-8 bytes and unicode strings. We specifically avoid mapping
    to whitespace/control characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode
characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token
dataset you end up needing around 5K for decent coverage. This is a significant percentage
of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8
bytes and unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
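# A few entries of the resulting table (these values follow directly from the
# construction above):
#   table = bytes_to_unicode()
#   table[ord("A")] -> "A"   # printable ASCII maps to itself
#   table[ord(" ")] -> "Ġ"   # space (0x20) is shifted to chr(256 + 32)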
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
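# For example (a direct consequence of the loop above):
#   get_pairs(("h", "e", "l", "l", "o"))
#   -> {("h", "e"), ("e", "l"), ("l", "l"), ("l", "o")}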
class GPT2Tokenizer(PreTrainedTokenizer):
"""
Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be
converted to an ID and is set to be this token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_bos_token=False,
**kwargs,
):
super(GPT2Tokenizer, self).__init__(
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
)
        with open(vocab_file, encoding="utf-8") as f:
            self.encoder = json.load(f)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as f:
            bpe_data = f.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
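    # A hedged walk-through of bpe(), assuming toy merge ranks rather than the
    # real GPT-2 merges table:
    #   with self.bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1},
    #   bpe("low") first merges ("l", "o"), giving ("lo", "w"),
    #   then merges ("lo", "w"), giving ("low",), and returns "low".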
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
# Maps all our bytes to unicode strings, avoiding control tokens
# of the BPE (spaces in our case)
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
GPT2 format sentence input:
- single sequence: <|endoftext|> tokens_a
- pair of sequences: <|endoftext|> tokens_a <|endoftext|> tokens_b
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
if self.add_bos_token:
bos = [self.bos_token_id]
else:
bos = []
if token_ids_1 is None:
return bos + token_ids_0
return bos + token_ids_0 + bos + token_ids_1
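    # Illustrative behavior (the ids are placeholders, not real vocabulary ids):
    #   add_bos_token=False: build_inputs_with_special_tokens([5, 6]) -> [5, 6]
    #   add_bos_token=True:  build_inputs_with_special_tokens([5, 6]) -> [bos_id, 5, 6]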
def save_vocabulary(self, save_directory, filename_prefix=None):
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
merge_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"],
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return (vocab_file, merge_file)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
import json
import logging
import os
from functools import lru_cache
from typing import List, Optional, Tuple
import regex as re
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"roberta-base": 512,
"roberta-large": 512,
"roberta-large-mnli": 512,
}
@lru_cache()
def bytes_to_unicode():
"""
    Returns a mapping between utf-8 bytes and unicode strings. We specifically avoid mapping to
    whitespace/control characters that the bpe code barfs on.
The reversible bpe codes work on unicode strings. This means you need a large # of unicode
characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token
dataset you end up needing around 5K for decent coverage. This is a significant percentage of
your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and
unicode strings.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def get_pairs(word):
"""
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class RobertaTokenizer(PreTrainedTokenizer):
"""Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer,
using byte-level Byte-Pair-Encoding.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`str`, `optional`, defaults to `<s>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to `</s>`):
The end of sequence token.
cls_token (:obj:`str`, `optional`, defaults to `<s>`):
The first token of the sequence when built with special tokens.
unk_token (:obj:`str`, `optional`, defaults to `<unk>`):
The unknown token. A token that is not in the vocabulary cannot be
converted to an ID and is set to be this token instead.
pad_token (:obj:`str`, `optional`, defaults to `<pad>`): A special token
used to make arrays of tokens the same size for batching purpose.
Will then be ignored by attention mechanisms or loss computation.
mask_token (:obj:`str`, `optional`, defaults to `<mask>`): A special token
representing a masked token (used by masked-language modeling pretraining
objectives, like BERT).
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_bos_token=False,
**kwargs,
):
super(RobertaTokenizer, self).__init__(
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs,
)
with open(vocab_file, encoding="utf-8") as file:
self.encoder = json.load(file)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding="utf-8") as file:
bpe_merges = file.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return len(self.encoder)
def get_vocab(self):
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = " ".join(word)
self.cache[token] = word
return word
def _tokenize(self, text):
"""Tokenize a string."""
bpe_tokens = []
for token in re.findall(self.pat, text):
token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
RoBERTa format sentence input:
- single sequence: [CLS] tokens_a [SEP]
- pair of sequences: [CLS] tokens_a [SEP] tokens_b [SEP]
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
if self.add_bos_token:
cls = [self.cls_token_id]
sep = [self.sep_token_id]
else:
cls = []
sep = []
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
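    # Illustrative behavior with add_bos_token=True (ids are placeholders):
    #   build_inputs_with_special_tokens([5], [6])
    #   -> [cls_id, 5, sep_id, 6, sep_id]
    # With add_bos_token=False the ids pass through concatenated, unchanged.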
def save_vocabulary(
self, save_directory: str, filename_prefix: Optional[str] = None
) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
merge_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"],
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair
classification task. RoBERTa does not make use of token type ids, therefore
a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
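    # For example (the length follows from the formula above):
    #   create_token_type_ids_from_sequences([5, 6]) -> [0, 0, 0, 0]
    #   (cls + two tokens + sep = 4 positions, all zeros)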
# coding=utf-8
# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for Google T5 (sentence piece)."""
import logging
import os
import warnings
from shutil import copyfile
from typing import List, Optional
import regex as re
import sentencepiece as spm
from .tokenization_base import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model"}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-base": 512,
}
class T5Tokenizer(PreTrainedTokenizer):
"""
    Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot
be converted to an ID and is set to be this token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
extra_ids (:obj:`int`, `optional`, defaults to 100):
Add a number of extra ids added to the end of the vocabulary for use
as sentinels. These tokens are accessible as "<extra_id_{%d}>" where
"{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed
            from the end of the vocabulary down to the beginning ("<extra_id_0>" is the
            last token in the vocabulary, as in T5 preprocessing; see `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
add_bos_token=False,
**kwargs,
):
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
extra_tokens = len(
set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))
)
if extra_tokens != extra_ids:
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens "
f"({additional_special_tokens}) are privided to T5Tokenizer. "
"In this case the additional_special_tokens must include the extra_ids tokens"
)
super().__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
additional_special_tokens=additional_special_tokens,
**kwargs,
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
self.add_bos_token = add_bos_token
@property
def vocab_size(self):
return self.sp_model.get_piece_size() + self._extra_ids
def get_vocab(self):
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text):
"""Tokenize a string."""
pieces = self.sp_model.encode(text, out_type=str)
return pieces
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
if token.startswith("<extra_id_"):
match = re.match(r"<extra_id_(\d+)>", token)
num = int(match.group(1))
return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)
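    # The sentinel arithmetic above, worked for an assumed vocabulary of
    # 32000 sentencepiece pieces + 100 extra ids (vocab_size = 32100):
    #   "<extra_id_0>"  -> 32100 - 0 - 1  = 32099  (the last id)
    #   "<extra_id_99>" -> 32100 - 99 - 1 = 32000  (the first id past the sp pieces)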
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index < self.sp_model.get_piece_size():
token = self.sp_model.IdToPiece(index)
else:
token = f"<extra_id_{self.vocab_size - 1 - index}>"
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) to a single string."""
current_sub_tokens = []
out_string = ""
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " "
current_sub_tokens = []
else:
current_sub_tokens.append(token)
out_string += self.sp_model.decode_pieces(current_sub_tokens)
return out_string.strip()
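    # A hedged sketch (the piece strings are assumed sentencepiece output):
    #   convert_tokens_to_string(["▁Hello", "▁world", "</s>"])
    #   decodes ["▁Hello", "▁world"] with the sp model, appends "</s> ",
    #   and strips, yielding something like "Hello world</s>".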
def _add_eos_if_not_present(self, token_ids):
if not self.add_bos_token:
return token_ids
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(f"This sequence already has {self.eos_token}.")
return token_ids
else:
return token_ids + [self.eos_token_id]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""Add special tokens to a sequence or a pair of sequence.
T5 format sentence input:
- single sequence: tokens_a </s>
- pair of sequences: tokens_a </s> tokens_b </s>
Args:
token_ids_0 (List[int]): The token ids of sentence 0.
token_ids_1 (List[int], optional): The token ids of sentence 1. Defaults to None.
Returns:
            :obj:`List[int]`: The sequence after adding special tokens.
"""
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
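    # Illustrative behavior (ids are placeholders):
    #   add_bos_token=True:  build_inputs_with_special_tokens([5], [6]) -> [5, eos_id, 6, eos_id]
    #   add_bos_token=False: the token ids pass through unchanged.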
def save_vocabulary(self, save_directory, filename_prefix=None):
"""Save the tokenizer vocabulary to a directory or file."""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
logger.info(f"Copy vocab file to {out_vocab_file}")
return (out_vocab_file,)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.