Commit 68851ae0 authored by zhougaofeng
Browse files

Update para_split.py

parent f3b09a0b
...@@ -98,8 +98,8 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -98,8 +98,8 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice, list_start_idx = find_repeating_patterns(line_fea_encode) list_indice, list_start_idx = find_repeating_patterns(line_fea_encode)
if len(list_indice)>0: # if len(list_indice)>0:
logger.info(f"发现了列表,列表行数:{list_indice}{list_start_idx}") # logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。 # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = [] segments = []
...@@ -107,10 +107,10 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -107,10 +107,10 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
for i in range(start, end+1): for i in range(start, end+1):
if i>0: if i>0:
if line_fea_encode[i] == 4: if line_fea_encode[i] == 4:
logger.info(f"列表行的第{i}行不是顶格的") # logger.info(f"列表行的第{i}行不是顶格的")
break break
else: # else:
logger.info(f"列表行的第{start}到第{end}行是列表") # logger.info(f"列表行的第{start}到第{end}行是列表")
return split_indices(total_lines, list_indice), list_start_idx return split_indices(total_lines, list_indice), list_start_idx
...@@ -350,7 +350,7 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, ...@@ -350,7 +350,7 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
next_first_para = next_paras[0] next_first_para = next_paras[0]
if pre_layout_list_info[1] and not next_layout_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 if pre_layout_list_info[1] and not next_layout_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger.info(f"连接page {page_num} 内的list") # logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行 # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = [] may_list_lines = []
for j in range(len(next_paras)): for j in range(len(next_paras)):
...@@ -379,7 +379,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -379,7 +379,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
return False return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger.info(f"连接page {page_num} 内的list") # logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行 # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = [] may_list_lines = []
for j in range(len(next_page_paras[0])): for j in range(len(next_page_paras[0])):
...@@ -431,7 +431,7 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang): ...@@ -431,7 +431,7 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
pre_last_line = layout_paras[i-1][-1][-1] pre_last_line = layout_paras[i-1][-1][-1]
next_first_line = layout_paras[i][0][0] next_first_line = layout_paras[i][0][0]
except Exception as e: except Exception as e:
logger.error(f"page layout {i} has no line") # logger.error(f"page layout {i} has no line")
continue continue
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type'] pre_last_line_type = pre_last_line['spans'][-1]['type']
...@@ -547,7 +547,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb ...@@ -547,7 +547,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
if "Table" in first_line_text or "Figure" in first_line_text: if "Table" in first_line_text or "Figure" in first_line_text:
pass pass
if debug_mode: if debug_mode:
logger.debug(line_hi.std()) # logger.debug(line_hi.std())
if line_hi.std()<2: if line_hi.std()<2:
"""行高度相同,那么判断是否居中""" """行高度相同,那么判断是否居中"""
...@@ -559,8 +559,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb ...@@ -559,8 +559,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
and not all([x1==layout_box[2] for x1 in all_right_x1]): and not all([x1==layout_box[2] for x1 in all_right_x1]):
merge_para = [l[0] for l in layout_para[start:end+1]] merge_para = [l[0] for l in layout_para[start:end+1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']]) para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
if debug_mode: # if debug_mode:
logger.debug(para_text) # logger.debug(para_text)
layout_para[start:end+1] = [merge_para] layout_para[start:end+1] = [merge_para]
index_offset -= end-start index_offset -= end-start
...@@ -624,14 +624,14 @@ def para_split(pdf_info_dict, debug_mode, lang="en"): ...@@ -624,14 +624,14 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
next_page_layout_bbox = new_layout_of_pages[page_num] next_page_layout_bbox = new_layout_of_pages[page_num]
is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang) is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
if debug_mode: # if debug_mode:
if is_conn: # if is_conn:
logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落") # logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
#
is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, lang) is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, lang)
if debug_mode: # if debug_mode:
if is_list_conn: # if is_list_conn:
logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落") # logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接 """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。 1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment