# Copyright (c) 2017 Elad Hoffer
# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
from collections import defaultdict
from functools import partial

import torch
import subword_nmt.apply_bpe
import sacremoses
import seq2seq.data.config as config


class Tokenizer:
    """
    Tokenizer class.
    """
    def __init__(self, vocab_fname=None, bpe_fname=None, lang=None, pad=1,
                 separator='@@'):
        """
        Constructor for the Tokenizer class.

        :param vocab_fname: path to the file with vocabulary
        :param bpe_fname: path to the file with bpe codes
        :param lang: dictionary with 'src' and 'tgt' language codes
        :param pad: pads vocabulary to a multiple of 'pad' tokens
        :param separator: tokenization separator
        """
        self.separator = separator
        self.lang = lang

        # default, so that get_state() works even before BPE codes load
        self.bpe = None
        if bpe_fname:
            with open(bpe_fname, 'r') as bpe_codes:
                self.bpe = subword_nmt.apply_bpe.BPE(bpe_codes)

        if vocab_fname:
            self.build_vocabulary(vocab_fname, pad)

        if lang:
            self.init_moses(lang)

    def init_moses(self, lang):
        # 'src' selects the tokenizer language, 'tgt' the detokenizer one
        self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
        self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])

    def build_vocabulary(self, vocab_fname, pad):
        """
        Builds tok2idx/idx2tok mappings from a vocabulary file, padded to
        a multiple of 'pad' tokens.
        """
        logging.info(f'Building vocabulary from {vocab_fname}')
        vocab = [config.PAD_TOKEN, config.UNK_TOKEN,
                 config.BOS_TOKEN, config.EOS_TOKEN]
        with open(vocab_fname) as vfile:
            for line in vfile:
                vocab.append(line.strip())

        self.pad_vocabulary(vocab, pad)

        self.vocab_size = len(vocab)
        logging.info(f'Size of vocabulary: {self.vocab_size}')

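        # Out-of-vocabulary tokens map to config.UNK: the defaultdict's
        # default factory, partial(int, config.UNK), returns that index.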
        self.tok2idx = defaultdict(partial(int, config.UNK))
        for idx, token in enumerate(vocab):
            self.tok2idx[token] = idx

        self.idx2tok = {}
        for key, value in self.tok2idx.items():
            self.idx2tok[value] = key

    def pad_vocabulary(self, vocab, pad):
        """
        Pads vocabulary to a multiple of 'pad' tokens.

        :param vocab: list with vocabulary
        :param pad: integer
        """
        vocab_size = len(vocab)
        padded_vocab_size = (vocab_size + pad - 1) // pad * pad
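        # Ceiling arithmetic, e.g. vocab_size == 5 and pad == 8 gives
        # padded_vocab_size == (5 + 8 - 1) // 8 * 8 == 8, so three
        # 'madeupwordNNNN' filler tokens are appended below.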
        for i in range(0, padded_vocab_size - vocab_size):
            token = f'madeupword{i:04d}'
            vocab.append(token)
        assert len(vocab) % pad == 0
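
    # Note: get_state()/set_state() provide a picklable snapshot of the
    # tokenizer, e.g. for saving alongside a model checkpoint (an assumed
    # use case, not something this module requires).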

    def get_state(self):
        logging.info('Saving state of the tokenizer')
        state = {
            'lang': self.lang,
            'separator': self.separator,
            'vocab_size': self.vocab_size,
            'bpe': self.bpe,
            'tok2idx': self.tok2idx,
            'idx2tok': self.idx2tok,
        }
        return state

    def set_state(self, state):
        logging.info('Restoring state of the tokenizer')
        self.lang = state['lang']
        self.separator = state['separator']
        self.vocab_size = state['vocab_size']
        self.bpe = state['bpe']
        self.tok2idx = state['tok2idx']
        self.idx2tok = state['idx2tok']

        self.init_moses(self.lang)

    def segment(self, line):
        """
        Converts a whitespace-tokenized sentence into a list of vocabulary
        indices, wrapped with the special BOS and EOS tokens.

        :param line: tokenized sentence

        returns: list of indices representing the sentence
        """
        line = line.strip().split()
        entry = [self.tok2idx[i] for i in line]
        entry = [config.BOS] + entry + [config.EOS]
        return entry

    def tokenize(self, line):
        """
        Applies Moses tokenization and BPE segmentation to a raw sentence,
        then converts it to a tensor of token indices with BOS/EOS markers.
        """
        tokenized = self.moses_tokenizer.tokenize(line, return_str=True)
        bpe = self.bpe.process_line(tokenized)
        segmented = self.segment(bpe)
        tensor = torch.tensor(segmented)
        return tensor

    def detokenize_bpe(self, inp, delim=' '):
        """
        Detokenizes single sentence and removes token separator characters.

        :param inputs: sequence of tokens
        :param delim: tokenization delimiter

        returns: string representing detokenized sentence
        """
        detok = delim.join([self.idx2tok[idx] for idx in inp])
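        # Merge BPE subwords, e.g. 'fol@@ low' -> 'follow'.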
        detok = detok.replace(self.separator + ' ', '')
        detok = detok.replace(self.separator, '')

        detok = detok.replace(config.BOS_TOKEN, '')
        detok = detok.replace(config.EOS_TOKEN, '')
        detok = detok.replace(config.PAD_TOKEN, '')
        detok = detok.strip()
        return detok

    def detokenize_moses(self, inp):
        """
        Applies Moses detokenization to a whitespace-delimited string.
        """
        output = self.moses_detokenizer.detokenize(inp.split())
        return output

    def detokenize(self, inp):
        """
        Converts a sequence of token indices back into a plain sentence by
        reversing BPE segmentation and Moses tokenization.
        """
        detok_bpe = self.detokenize_bpe(inp)
        output = self.detokenize_moses(detok_bpe)
        return output