# preprocess_wsc.py
import re

def doc_to_text(x):
    """Return the passage in ``x`` with both spans wrapped in marker tokens.

    The first span is rewritten as ``* span *`` and the second span as
    ``# span #``.

    Args:
        x: Mapping with the keys ``text``, ``span1_text``, ``span1_index``,
            ``span2_text`` and ``span2_index``. Each index is the number of
            "non-whitespace run + one whitespace character" units that
            precede the span, counted from the start of ``text``.

    Returns:
        The marked-up text. A span that does not occur at its index is left
        unmarked.
    """

    def _mark_span(text, span_str, span_idx, mark):
        """Wrap ``span_str`` in ``mark`` where it starts at token ``span_idx``."""
        # Group 1 consumes exactly ``span_idx`` leading tokens (each followed
        # by one whitespace character); group 2 is the span itself.
        # The span is escaped so characters such as ``$``, ``(`` or ``+`` are
        # matched literally rather than interpreted as regex syntax.
        pattern = (
            r'^((?:\S+\s){' + str(span_idx) + r'})('
            + re.escape(span_str) + r')'
        )
        # A callable replacement keeps ``mark`` from being parsed as a
        # replacement template.
        return re.sub(
            pattern,
            lambda m: '{0}{1} {2} {1}'.format(m.group(1), mark, m.group(2)),
            text,
        )

    text = x['text']
    text = _mark_span(text, x['span1_text'], x['span1_index'], '*')
    # Compensate for the 2 extra "words" (the two '*' marks) inserted by the
    # previous step when the first span precedes the second one.
    span2_index = x['span2_index'] + 2 * (x['span1_index'] < x['span2_index'])
    text = _mark_span(text, x['span2_text'], span2_index, '#')

    return text