sentence_splitter.py 545 Bytes
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import re


try:
    import nltk

    NLTK_AVAILABLE = True
except (ImportError, ModuleNotFoundError):
    NLTK_AVAILABLE = False

if NLTK_AVAILABLE:
    try:
        nltk.download("punkt", quiet=True)
    except FileExistsError:  # multiprocessing race condition
        pass


def add_newline_to_end_of_each_sentence(x: str) -> str:
    re.sub("<n>", "", x)  # remove pegasus newline char
    assert NLTK_AVAILABLE, "nltk must be installed to separate newlines betwee sentences. (pip install nltk)"
    return "\n".join(nltk.sent_tokenize(x))