# preprocess_data.py
# (web-viewer scrape residue removed: file header, commit byline, and the
# rendered line-number gutter were captured as text and are not Python.)

import json
import nltk
nltk.download('punkt')

from bert_tokenization import FullTokenizer


def document_generator_provider(input_file):
    """Yield one list of sentences per document in a JSON-lines file.

    Each line of *input_file* is a JSON object with a ``'text'`` field.
    The text is split on newlines and every non-empty line is sentence-
    tokenized with NLTK's punkt tokenizer.

    Args:
        input_file: path to a JSON-lines file; each record must contain
            a ``'text'`` key (raises KeyError otherwise).

    Yields:
        list[str]: the sentences of one document, in order.
    """
    with open(input_file, 'r') as ifile:
        for document in ifile:
            data = json.loads(document)
            text = data['text']
            sentences = []
            for line in text.split('\n'):
                # BUG FIX: str.split('\n') can never yield the string '\n',
                # so the original guard `line != '\n'` was always true.
                # Skip genuinely empty lines instead of tokenizing them.
                if line:
                    sentences.extend(nltk.tokenize.sent_tokenize(line))
            yield sentences


if __name__ == '__main__':

    print('processing data ...')

    # Scratch paths for a local smoke test of the tokenization pipeline.
    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
    vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'

    tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
    # Walk every document, echoing each sentence and its wordpiece tokens.
    for doc_sentences in document_generator_provider(input_file):
        for sent in doc_sentences:
            wordpieces = tokenizer.tokenize(sent)
            print(sent)
            print(wordpieces)