preprocess_wsc.py 614 Bytes
Newer Older
lintangsutawika's avatar
lintangsutawika committed
1
2
3
import re


lintangsutawika's avatar
lintangsutawika committed
4
def doc_to_text(x):
    """Return the passage in ``x`` with its two annotated spans marked inline.

    The first span is wrapped as ``* span *`` and the second as ``# span #``.

    Args:
        x: Mapping providing ``"text"``, ``"span1_text"``, ``"span1_index"``,
            ``"span2_text"`` and ``"span2_index"``. Each index is the number
            of whitespace-delimited words that precede the span in ``text``.

    Returns:
        The text with both spans marked. A span whose text is not found at
        its stated word index is left unmarked.
    """

    def _mark_span(text, span_str, span_idx, mark):
        """Surround ``span_str`` with ``mark`` if it starts at word ``span_idx``."""
        # Skip exactly ``span_idx`` "word + one whitespace char" groups, then
        # require the span text literally. The span is escaped so that regex
        # metacharacters in it (e.g. "(", "+", "$", ".") cannot corrupt the
        # pattern or silently prevent a match.
        pattern = (
            r"^((?:\S+\s){" + str(span_idx) + r"})(" + re.escape(span_str) + r")"
        )

        def _wrap(match):
            # A callable replacement keeps backslashes in the matched text or
            # in the mark from being interpreted as template escapes.
            return "{0}{1} {2} {1}".format(match.group(1), mark, match.group(2))

        return re.sub(pattern, _wrap, text)

    text = x["text"]
    text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
    # Compensate for the 2 "words" (the two marks) added in the previous step
    # when the second span comes after the first one.
    span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
    text = _mark_span(text, x["span2_text"], span2_index, "#")

    return text