presplit_sentences_json.py 645 Bytes
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""

import sys
import json

import nltk

# Fail early with the usage message instead of a bare IndexError when the
# two required paths are missing (extra arguments are still ignored).
if len(sys.argv) < 3:
    sys.exit("Usage: python scripts/presplit_sentences_json.py "
             "<original loose json file> <output loose json file>")

# Fetch the Punkt sentence-tokenizer data used by sent_tokenize below.
# NOTE(review): newer NLTK releases may look up the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the NLTK version in use.
nltk.download('punkt')

input_file = sys.argv[1]
output_file = sys.argv[2]

# Separator placed between sentences when the 'text' field is rebuilt.
line_seperator = "\n"

# Input and output are "loose JSON": one JSON object per line. Each object's
# 'text' field is re-written so that every sentence sits on its own line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale.
with open(input_file, 'r', encoding='utf-8') as ifile, \
        open(output_file, 'w', encoding='utf-8') as ofile:
    # Iterate the file object directly so the whole input is never held in
    # memory at once.
    for doc in ifile:
        # Tolerate blank lines (e.g. a trailing empty line) rather than
        # crashing in json.loads.
        if not doc.strip():
            continue
        parsed = json.loads(doc)
        sent_list = []
        for line in parsed['text'].split('\n'):
            # Pieces produced by split('\n') never contain '\n', so the
            # emptiness check has to be done on the stripped piece.
            if line.strip():
                sent_list.extend(nltk.tokenize.sent_tokenize(line))
        parsed['text'] = line_seperator.join(sent_list)
        ofile.write(json.dumps(parsed) + '\n')