Unverified commit f70246d6, authored by Xiaomeng Zhao and committed by GitHub
Browse files

Merge pull request #1058 from myhloli/dev

refactor(para): improve line stop flag and remove unused debug mode
parents 93208f44 5d6cbcb1
......@@ -352,7 +352,7 @@ def __para_merge_page(blocks):
continue
-def para_split(pdf_info_dict, debug_mode=False):
+def para_split(pdf_info_dict):
all_blocks = []
for page_num, page in pdf_info_dict.items():
blocks = copy.deepcopy(page['preproc_blocks'])
......
......@@ -114,7 +114,7 @@ def chars_to_content(span):
del span['chars']
-LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',')
+LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
def fill_char_in_spans(spans, all_chars):
for char in all_chars:
......@@ -830,7 +830,7 @@ def pdf_parse_union(
pdf_info_dict[f'page_{page_id}'] = page_info
"""分段"""
-para_split(pdf_info_dict, debug_mode=debug_mode)
+para_split(pdf_info_dict)
"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment