data_processor.py

"""
Prepare input data for Google's BERT Model.

Contains some functions from tensor2tensor library: https://github.com/tensorflow/tensor2tensor

"""
from typing import NamedTuple, List, Union, Tuple

TokenizedSentence = List[str]
TokenizedInput = Union[Tuple[TokenizedSentence, TokenizedSentence], TokenizedSentence]

class DataProcessor():
    def __init__(self, vocab_path):
        self.encoder_file_path = encoder_file_path
        self.token_indexer = json.load(open(vocab_path))


    def tokenize(text):
    """Encode a unicode string as a list of tokens.

    Args:
        text: a unicode string
    Returns:
        a list of tokens as Unicode strings
    """
    if not text:
        return []
    ret = []
    token_start = 0
    # Classify each character in the input string
    is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
    for pos in range(1, len(text)):
        if is_alnum[pos] != is_alnum[pos - 1]:
        token = text[token_start:pos]
        if token != u" " or token_start == 0:
            ret.append(token)
        token_start = pos
    final_token = text[token_start:]
    ret.append(final_token)
    return ret


    def detokenize(tokens):
    """Decode a list of tokens to a unicode string.

    Args:
        tokens: a list of Unicode strings
    Returns:
        a unicode string
    """
    token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
    ret = []
    for i, token in enumerate(tokens):
        if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
        ret.append(u" ")
        ret.append(token)
    return "".join(ret)

    def encode(input_sentences: List[TokenizedInput]) -> np.array:
        """ Prepare a torch.Tensor of inputs for BERT model from a string.

        Args:
            input_sentences: list of
                - pairs of tokenized sentences (sentence_A, sentence_B) or
                - tokenized sentences (will be considered as sentence_A only)

        Return:
            Numpy array of formated inputs for BERT model
        """
        batch_size = sum(min(len(x), n_perso_permute) for x in X1)
        input_mask = np.zeros((n_batch, n_cands, n_ctx), dtype=np.float32)
        input_array = np.zeros((n_batch, n_cands, n_ctx, 3), dtype=np.int32)
        i = 0
        for tokenized_input in input_sentences:
            x1j, lxj, lperso, lhisto, dialog_embed = format_transformer_input(x1, x2, xcand_j, text_encoder,
                                                                                dialog_embed_mode, max_len=max_len,
                                                                                add_start_stop=True)
            lmj = len(xcand_j[:max_len]) + 1
            xmb[i, j, :lxj, 0] = x1j
            if dialog_embed_mode == 1 or dialog_embed_mode == 2:
                xmb[i, j, :lxj, 2] = dialog_embed
            mmb[i, j, :lxj] = 1
            if fix_lm_index: # Take one before so we don't predict from classify token...
                mmb_eval[i, j, (lxj-lmj-1):lxj-1] = 1 # This one only mask the response so we get the perplexity on the response only
            else:
                mmb_eval[i, j, (lxj-lmj):lxj] = 1 # This one only mask the response so we get the perplexity on the response only
            xmb[i, j, :, 1] = np.arange(n_vocab+n_special, n_vocab+n_special+n_ctx)
            i += 1
        return input_array, input_mask