Commit 68851ae0 authored by zhougaofeng
Browse files

Update para_split.py

parent f3b09a0b
......@@ -98,8 +98,8 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice, list_start_idx = find_repeating_patterns(line_fea_encode)
if len(list_indice)>0:
logger.info(f"发现了列表,列表行数:{list_indice}{list_start_idx}")
# if len(list_indice)>0:
# logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = []
......@@ -107,10 +107,10 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
for i in range(start, end+1):
if i>0:
if line_fea_encode[i] == 4:
logger.info(f"列表行的第{i}行不是顶格的")
# logger.info(f"列表行的第{i}行不是顶格的")
break
else:
logger.info(f"列表行的第{start}到第{end}行是列表")
# else:
# logger.info(f"列表行的第{start}到第{end}行是列表")
return split_indices(total_lines, list_indice), list_start_idx
......@@ -350,7 +350,7 @@ def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info,
next_first_para = next_paras[0]
if pre_layout_list_info[1] and not next_layout_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger.info(f"连接page {page_num} 内的list")
# logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
for j in range(len(next_paras)):
......@@ -379,7 +379,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
logger.info(f"连接page {page_num} 内的list")
# logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
for j in range(len(next_page_paras[0])):
......@@ -431,7 +431,7 @@ def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang):
pre_last_line = layout_paras[i-1][-1][-1]
next_first_line = layout_paras[i][0][0]
except Exception as e:
logger.error(f"page layout {i} has no line")
# logger.error(f"page layout {i} has no line")
continue
pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']])
pre_last_line_type = pre_last_line['spans'][-1]['type']
......@@ -547,7 +547,7 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
if "Table" in first_line_text or "Figure" in first_line_text:
pass
if debug_mode:
logger.debug(line_hi.std())
# logger.debug(line_hi.std())
if line_hi.std()<2:
"""行高度相同,那么判断是否居中"""
......@@ -559,8 +559,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, deb
and not all([x1==layout_box[2] for x1 in all_right_x1]):
merge_para = [l[0] for l in layout_para[start:end+1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
if debug_mode:
logger.debug(para_text)
# if debug_mode:
# logger.debug(para_text)
layout_para[start:end+1] = [merge_para]
index_offset -= end-start
......@@ -624,14 +624,14 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
next_page_layout_bbox = new_layout_of_pages[page_num]
is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang)
if debug_mode:
if is_conn:
logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
# if debug_mode:
# if is_conn:
# logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落")
#
is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, lang)
if debug_mode:
if is_list_conn:
logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
# if debug_mode:
# if is_list_conn:
# logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落")
"""接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接
1. 正文中有时出现一个行顶格,接下来几行缩进的情况。
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment