# spiece_tokenizer.py
import os

class SPieceTokenizer:
    """Minimal tokenizer wrapper around a SentencePiece model.

    Calling an instance encodes its argument with the wrapped
    ``sentencepiece.SentencePieceProcessor``, appends the model's EOS id
    and returns the ids in a dict under the ``"input_ids"`` key.
    """

    @classmethod
    def from_pretrained(cls, path):
        """Build a tokenizer from the SentencePiece model file at *path*.

        Implemented as a classmethod so that a subclass calling
        ``from_pretrained`` gets an instance of itself rather than of the
        base class.
        """
        return cls(path)

    def __init__(self, tokenizer_path):
        """Load the SentencePiece model stored at *tokenizer_path*.

        ``sentencepiece`` is imported here rather than at module level, so
        importing this module does not require the package to be installed.
        """
        import sentencepiece

        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path)
        # Cache the EOS id once; __call__ appends it to every encoding.
        self.end = self.tokenizer.eos_id()

    def get_vocab(self):
        """Return a dict mapping every piece of the model to its integer id."""
        processor = self.tokenizer
        return {
            processor.id_to_piece(piece_id): piece_id
            for piece_id in range(processor.get_piece_size())
        }

    def __call__(self, string):
        """Encode *string* and return ``{"input_ids": ids}``.

        ``ids`` is whatever the processor's ``encode`` returns for *string*,
        with the cached EOS id appended as the final element.
        """
        ids = self.tokenizer.encode(string)
        ids.append(self.end)
        return {"input_ids": ids}