#ifndef CUBERT_TOKENIZATION_H
#define CUBERT_TOKENIZATION_H

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

namespace cuBERT {

void load_vocab(const char* vocab_file, std::unordered_map<std::string, uint64_t>* vocab);
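
/*
 * Usage sketch (illustrative, not part of the API): loading a BERT-style
 * vocab file -- assumed here to be a vocab.txt with one token per line, whose
 * line index becomes its id -- into the map consumed by the tokenizers below.
 *
 *   std::unordered_map<std::string, uint64_t> vocab;
 *   cuBERT::load_vocab("vocab.txt", &vocab);   // "vocab.txt" is a placeholder path
 *   uint64_t cls_id = vocab.at("[CLS]");       // assumes the standard BERT special tokens
 */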

/**
 * Checks whether `c` is a whitespace character.
 * @param c the character to check
 * @return true if `c` is a whitespace character
 */
bool _is_whitespace(int c);

/**
 * Checks whether `c` is a control character.
 * @param c the character to check
 * @return true if `c` is a control character
 */
bool _is_control(int c);

/**
 * Checks whether `cp` is a punctuation character.
 * @param cp the character to check
 * @return true if `cp` is a punctuation character
 */
bool _is_punctuation(int cp);

/**
 * Runs basic tokenization (punctuation splitting, lower casing, etc.).
 */
class BasicTokenizer
{
    public:
    /**
     * Constructs a BasicTokenizer.
     * @param do_lower_case Whether to lower case the input.
     */
    explicit BasicTokenizer(bool do_lower_case = true) : do_lower_case(do_lower_case) {}

    BasicTokenizer(const BasicTokenizer& other) = delete;

    virtual ~BasicTokenizer() = default;

    /**
     * Tokenizes a piece of text.
     *
     * Internally uses the following helpers:
     *   - to_lower: lower-cases the text (when do_lower_case is set).
     *   - _run_strip_accents: strips accents from a piece of text.
     *   - _clean_text: performs invalid character removal and whitespace cleanup on text.
     *   - _tokenize_chinese_chars: adds whitespace around any CJK character.
     *   - _run_split_on_punc: splits punctuation on a piece of text.
     *   - whitespace_tokenize: runs basic whitespace cleaning and splitting on a piece of text.
     *
     * @param text          the input text
     * @param output_tokens the resulting tokens
     * @param max_length    upper limit on the number of output tokens
     */
    void tokenize(const char* text, std::vector<std::string>* output_tokens, size_t max_length);

    private:
    const bool do_lower_case;

    /**
     * Checks whether `cp` is the code point of a CJK character.
     * @param cp Unicode code point
     * @return true if `cp` is the code point of a CJK character
     */
    inline static bool _is_chinese_char(int cp);
};
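
/*
 * Usage sketch (illustrative): basic tokenization on its own, lower-casing and
 * splitting on whitespace and punctuation. The cap of 128 tokens is an
 * arbitrary example value for max_length.
 *
 *   cuBERT::BasicTokenizer basic(true);
 *   std::vector<std::string> tokens;
 *   basic.tokenize("Hello, World! This is cuBERT.", &tokens, 128);
 *   // expected (assuming default lower casing):
 *   //   "hello" "," "world" "!" "this" "is" "cubert" "."
 */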

/**
 * Runs WordPiece tokenization.
 */
class WordpieceTokenizer
{
    public:
    explicit WordpieceTokenizer(std::unordered_map<std::string, uint64_t>* vocab,
                                std::string unk_token        = "[UNK]",
                                int max_input_chars_per_word = 200)
        : vocab(vocab), unk_token(unk_token), max_input_chars_per_word(max_input_chars_per_word)
    {
    }

    WordpieceTokenizer(const WordpieceTokenizer& other) = delete;

    virtual ~WordpieceTokenizer() = default;

    /**
     * Tokenizes a piece of text into its word pieces.
     *
     * This uses a greedy longest-match-first algorithm to perform tokenization
     * using the given vocabulary.
     *
     * For example:
     *   input = "unaffable"
     *   output = ["un", "##aff", "##able"]
     *
     * @param text A single token or whitespace separated tokens. This should have
     * already been passed through `BasicTokenizer`.
     * @param output_tokens A list of wordpiece tokens.
     */
    void tokenize(const std::string& text, std::vector<std::string>* output_tokens);

    private:
    const std::unordered_map<std::string, uint64_t>* vocab;
    const std::string unk_token;
    const int max_input_chars_per_word;
};
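
/*
 * Usage sketch (illustrative): greedy longest-match-first word-piece splitting
 * against a previously loaded vocabulary, mirroring the "unaffable" example
 * documented above. The vocab path is a placeholder.
 *
 *   std::unordered_map<std::string, uint64_t> vocab;
 *   cuBERT::load_vocab("vocab.txt", &vocab);
 *   cuBERT::WordpieceTokenizer wordpiece(&vocab);
 *   std::vector<std::string> pieces;
 *   wordpiece.tokenize("unaffable", &pieces);
 *   // with a vocabulary containing "un", "##aff", "##able":
 *   //   pieces == {"un", "##aff", "##able"}
 */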

/**
 * Runs end-to-end tokenization.
 */
class FullTokenizer
{
    public:
    FullTokenizer(const char* vocab_file, bool do_lower_case = true)
    {
        vocab = new std::unordered_map<std::string, uint64_t>();
        load_vocab(vocab_file, vocab);
        basic_tokenizer     = new BasicTokenizer(do_lower_case);
        wordpiece_tokenizer = new WordpieceTokenizer(vocab);
    }

    ~FullTokenizer()
    {
        // Delete in reverse order of construction. delete on a null pointer is
        // a no-op, so no null checks are needed; nulling the pointers afterwards
        // guards against accidental reuse.
        delete wordpiece_tokenizer;
        wordpiece_tokenizer = nullptr;

        delete basic_tokenizer;
        basic_tokenizer = nullptr;

        delete vocab;
        vocab = nullptr;
    }

    void tokenize(const char* text, std::vector<std::string>* output_tokens, size_t max_length);

    inline uint64_t convert_token_to_id(const std::string& token)
    {
        auto item = vocab->find(token);
        if(item == vocab->end())
        {
            std::cerr << "vocab missing key: " << token << std::endl;
            return 0;
        }
        else
        {
            return item->second;
        }
    }

    void convert_tokens_to_ids(const std::vector<std::string>& tokens, uint64_t* ids);

    private:
    std::unordered_map<std::string, uint64_t>* vocab;
    BasicTokenizer* basic_tokenizer;
    WordpieceTokenizer* wordpiece_tokenizer;
};
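
/*
 * Usage sketch (illustrative): end-to-end tokenization plus id conversion.
 * "vocab.txt" and the length of 128 are placeholder values.
 *
 *   cuBERT::FullTokenizer tokenizer("vocab.txt", true);
 *   std::vector<std::string> tokens;
 *   tokenizer.tokenize("The quick brown fox", &tokens, 128);
 *   std::vector<uint64_t> ids(tokens.size());
 *   tokenizer.convert_tokens_to_ids(tokens, ids.data());
 */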

} // namespace cuBERT

#endif // CUBERT_TOKENIZATION_H