# presplit_sentences_json.py
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""

import sys
import json

import nltk

# Validate the command line before doing any work, so a missing argument
# yields a usage message (on stderr, exit status 1) instead of a bare
# IndexError raised after the download below has already run.
if len(sys.argv) < 3:
    sys.exit(
        "Usage: python scripts/presplit_sentences_json.py "
        "<original loose json file> <output loose json file>"
    )

# Path of the loose-JSON file to read and of the loose-JSON file to write.
input_file = sys.argv[1]
output_file = sys.argv[2]

# Fetch the Punkt sentence-tokenizer data used by nltk's sent_tokenize.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

# Separator placed between sentences when the 'text' field is rebuilt.
line_seperator = "\n"

# Read one JSON document per input line, split its 'text' field into
# sentences, and write the document back out with one sentence per line.
# JSON text is UTF-8, so the encoding is stated explicitly rather than
# inherited from the platform locale.
with open(input_file, 'r', encoding='utf-8') as ifile, \
        open(output_file, 'w', encoding='utf-8') as ofile:
    # Iterate the file object directly (not readlines()) so the whole input
    # is never held in memory at once.
    for doc in ifile:
        if not doc.strip():
            # A blank line is not a JSON document; json.loads would raise.
            continue
        parsed = json.loads(doc)
        sent_list = []
        for line in parsed['text'].split('\n'):
            # Pieces produced by split('\n') can never equal '\n', so the
            # previous `line != '\n'` test was always true. Test for actual
            # content so empty lines skip the tokenizer call.
            if line.strip():
                sent_list.extend(nltk.tokenize.sent_tokenize(line))
        parsed['text'] = line_seperator.join(sent_list)
        # One JSON document per output line, mirroring the input layout.
        ofile.write(json.dumps(parsed) + '\n')