"src/vscode:/vscode.git/clone" did not exist on "a9768cb38fc21480bc4079c7d02bd611290da71d"
Unverified commit d0878333, authored by Raul Puri and committed by GitHub
Browse files

added missing presplit_sentences_json.py

parent 66719e97
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""
import sys
import json
import nltk
nltk.download('punkt')
# Separator placed between sentences in the rewritten 'text' field.
line_seperator = "\n"


def presplit_text(text, sent_tokenize=None, separator=line_seperator):
    """Return *text* with its sentences joined by *separator*.

    The text is first cut on newlines, each non-blank line is split into
    sentences, and all sentences are joined back together.

    Args:
        text: The document text to split.
        sent_tokenize: Callable mapping one line to a list of sentences.
            Defaults to ``nltk.tokenize.sent_tokenize``.
        separator: String inserted between consecutive sentences.

    Returns:
        A single string containing every sentence, separated by *separator*.
    """
    if sent_tokenize is None:
        sent_tokenize = nltk.tokenize.sent_tokenize
    sentences = []
    for line in text.split('\n'):
        # Pieces produced by split('\n') never contain a newline, so blank
        # lines are detected by checking for whitespace-only content.
        if line.strip():
            sentences.extend(sent_tokenize(line))
    return separator.join(sentences)


def presplit_file(input_path, output_path):
    """Rewrite a loose JSON file so each document's 'text' is pre-split.

    Every non-blank line of *input_path* is parsed as one JSON object; its
    'text' value is replaced by the result of ``presplit_text`` and the
    object is written as one line to *output_path*.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a document has no 'text' key.
    """
    with open(input_path, 'r', encoding='utf-8') as ifile, \
            open(output_path, 'w', encoding='utf-8') as ofile:
        # Iterate the file object directly so large corpora are streamed
        # line by line instead of being loaded into memory at once.
        for doc in ifile:
            if not doc.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            parsed = json.loads(doc)
            parsed['text'] = presplit_text(parsed['text'])
            ofile.write(json.dumps(parsed) + '\n')


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("Usage: python scripts/presplit_sentences_json.py "
                 "<original loose json file> <output loose json file>")
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    presplit_file(input_file, output_file)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment