preprocess_wsc.py 613 Bytes
Newer Older
lintangsutawika's avatar
lintangsutawika committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import re

def doc_to_text(x):
    """Return the example's text with both spans wrapped in marker tokens.

    The first span is surrounded by ``*`` and the second by ``#``, each
    marker separated from the span by a single space, e.g.
    ``"* Mark * told Pete ... # He # should ..."``.

    Args:
        x: Mapping with the keys ``'text'``, ``'span1_text'``,
            ``'span1_index'``, ``'span2_text'`` and ``'span2_index'``.
            The index values are word offsets: the number of
            whitespace-terminated tokens that precede the span in ``text``.

    Returns:
        The marked-up text. A span that is not found at its stated word
        offset is left unmarked.
    """

    def _mark_span(text, span_str, span_idx, mark):
        # Group 1: exactly `span_idx` leading "word + one whitespace" units.
        # Group 2: the span itself, escaped so that punctuation inside the
        # span (e.g. "(", ".", "$") is matched literally rather than being
        # interpreted as regex syntax.
        pattern = (
            r'^((?:\S+\s){' + str(int(span_idx)) + r'})('
            + re.escape(span_str) + r')'
        )

        # A callable replacement avoids any backslash/backreference
        # processing of `mark`.
        def _wrap(match):
            return '{0}{1} {2} {1}'.format(match.group(1), mark, match.group(2))

        # re.sub signature is (pattern, repl, string); the pattern is anchored
        # at the start of the text, so at most one substitution happens.
        return re.sub(pattern, _wrap, text)

    text = _mark_span(x['text'], x['span1_text'], x['span1_index'], '*')

    # Marking span1 inserted two extra whitespace-separated tokens (the two
    # '*' marks). If span1 precedes span2, span2's word offset shifts by 2.
    span2_index = x['span2_index']
    if x['span1_index'] < x['span2_index']:
        span2_index += 2

    return _mark_span(text, x['span2_text'], span2_index, '#')