# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

import json
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Any

import numpy


class MegatronTokenizer(ABC):
    """Abstract base class for tokenizers.

    Absent a config or class-specific tracking of which objects are uniquely
    identifying, all keyword arguments are recorded as unique identifiers.

    Subclasses must implement ``tokenize``, ``vocab``, ``inv_vocab`` and
    ``vocab_size``. Every other method and property defined here is optional
    and raises ``NotImplementedError`` unless overridden.

    Args:
        tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes

        tokenizer_options (Dict[str, Any]): All tokenizer options
    """

    def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
        # Insertion order is kept: class name, then paths, then the options in
        # the order they were passed.
        self.unique_identifiers = OrderedDict()
        self.unique_identifiers["class"] = type(self).__name__
        self.unique_identifiers["tokenizer_path"] = list(tokenizer_paths)
        # Option values are stored as strings so that arbitrary objects can be
        # serialized by json.dumps below.
        for option, value in tokenizer_options.items():
            self.unique_identifiers[option] = str(value)

        # JSON rendering of the identifiers collected above.
        self.unique_description = json.dumps(self.unique_identifiers, indent=4)

        super().__init__()

    @abstractmethod
    def tokenize(self, text: str) -> numpy.ndarray:
        """Convert text to embedding ids.

        Args:
            text (str): The text to convert

        Returns:
            numpy.ndarray: The converted embedding ids
        """
        pass

    def detokenize(self, ids: numpy.ndarray) -> str:
        """Convert embedding ids to text.

        Args:
            ids (numpy.ndarray): The ids to convert

        Returns:
            str: The converted text

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        raise NotImplementedError(f"{type(self).__name__} has no method 'detokenize'")

    def offsets(self, ids: list[int], text: str) -> list[int]:
        """Convert embedding ids to text offsets.

        Args:
            ids (list[int]): The ids to convert

            text (str): The text to convert

        Returns:
            list[int]: The converted offsets

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        raise NotImplementedError(f"{type(self).__name__} has no method 'offsets'")

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @property
    @abstractmethod
    def vocab_size(self):
        """The vocabulary size."""
        pass

    @property
    def cls(self):
        """The CLS token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'cls'")

    @property
    def sep(self):
        """The SEP token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'sep'")

    @property
    def pad(self):
        """The PAD token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'pad'")

    @property
    def eod(self):
        """The EOD token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'eod'")

    @property
    def bos(self):
        """The BOS token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'bos'")

    @property
    def eos(self):
        """The EOS token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'eos'")

    @property
    def mask(self):
        """The MASK token id.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        raise NotImplementedError(f"{type(self).__name__} has no attribute 'mask'")