# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Copied from https://github.com/dariush-bahrami/character-tokenizer/blob/master/charactertokenizer/core.py

CharacterTokenizer for Hugging Face Transformers.

This is heavily inspired by the CanineTokenizer in the transformers package.
"""

import json
import os
from pathlib import Path
from typing import Optional, Sequence

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer


class CharTokenizer(PreTrainedTokenizer):
    def __init__(self, characters: Sequence[str], model_max_length: int, chat_template, **kwargs):
        """Character tokenizer for Hugging Face transformers.

        Args:
            characters (Sequence[str]): List of desired characters. Any character that is not in this list
                is replaced by the unknown token "U" (id=3). The special tokens and their ids are:
                    "S" (sep): 0
                    "E" (eos): 1
                    "P" (pad): 2
                    "U" (unk): 3
                Each character in ``characters`` is then assigned an id starting at 4.
            model_max_length (int): Model maximum sequence length.
            chat_template: Chat template stored on the tokenizer and included in its saved config.
        """
        eos_token_str = "E"
        sep_token_str = "S"
        pad_token_str = "P"
        unk_token_str = "U"

        self.characters = characters
        self.model_max_length = model_max_length

        eos_token = AddedToken(eos_token_str, lstrip=False, rstrip=False)
        sep_token = AddedToken(sep_token_str, lstrip=False, rstrip=False)
        pad_token = AddedToken(pad_token_str, lstrip=False, rstrip=False)
        unk_token = AddedToken(unk_token_str, lstrip=False, rstrip=False)

        # The vocab must be built before super().__init__ so the special tokens can be resolved to ids.
        self._vocab_str_to_int = {
            sep_token_str: 0,
            eos_token_str: 1,
            pad_token_str: 2,
            unk_token_str: 3,
            **{ch: i + 4 for i, ch in enumerate(characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}

        super().__init__(
            eos_token=eos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            unk_token=unk_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            **kwargs,
        )
        self.chat_template = chat_template

    @property
    def vocab_size(self) -> int:
        return len(self._vocab_str_to_int)

    def get_vocab(self):
        return self._vocab_str_to_int

    def _tokenize(self, text: str) -> list[str]:
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["U"])

    def _convert_id_to_token(self, index: int) -> str:
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        sep = [self.sep_token_id]
        # NOTE: no cls_token is defined on this tokenizer, so self.cls_token_id is None;
        # this mirrors the reference implementation this class was copied from.
        cls = [self.cls_token_id]
        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self,
        token_ids_0: list[int],
        token_ids_1: Optional[list[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> list[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def get_config(self) -> dict:
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
            "chat_template": self.chat_template,
        }

    @classmethod
    def from_config(cls, config: dict):
        cfg = {}
        cfg["characters"] = [chr(i) for i in config["char_ords"]]
        cfg["model_max_length"] = config["model_max_length"]
        cfg["chat_template"] = config["chat_template"]
        return cls(**cfg)

    def save_pretrained(self, save_directory: str | os.PathLike, **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        cfg = self.get_config()
        with open(cfg_file, "w") as f:
            json.dump(cfg, f, indent=4)

    @classmethod
    def from_pretrained(cls, save_directory: str | os.PathLike, **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        with open(cfg_file) as f:
            cfg = json.load(f)
        return cls.from_config(cfg)
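

# Minimal usage sketch (illustrative only, not part of the copied module). The character
# set below is an assumed example; any single characters that do not collide with the
# special tokens "E", "S", "P", "U" work the same way.
if __name__ == "__main__":
    tokenizer = CharTokenizer(
        characters=list("0123456789+-*/= "),
        model_max_length=64,
        chat_template=None,
    )
    # Encode without special tokens: each character maps to its own id (characters start at id 4).
    ids = tokenizer("12+34=46", add_special_tokens=False)["input_ids"]
    print(ids)
    # Decoding simply joins the characters back together; unknown characters would have
    # been mapped to the unk token "U" (id=3) during encoding.
    print(tokenizer.decode(ids))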