Commit dd4fde1f authored by zhougaofeng
Browse files

Update para_split_v2.py

parent e469df71
...@@ -140,9 +140,9 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -140,9 +140,9 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode) list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
if len(list_indice) > 0: # if len(list_indice) > 0:
if debug_able: # if debug_able:
logger.info(f"发现了列表,列表行数:{list_indice}{list_start_idx}") # logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。 # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = [] segments = []
...@@ -150,12 +150,12 @@ def __detect_list_lines(lines, new_layout_bboxes, lang): ...@@ -150,12 +150,12 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
for i in range(start, end + 1): for i in range(start, end + 1):
if i > 0: if i > 0:
if line_fea_encode[i] == 4: if line_fea_encode[i] == 4:
if debug_able: # if debug_able:
logger.info(f"列表行的第{i}行不是顶格的") # logger.info(f"列表行的第{i}行不是顶格的")
break break
else: # else:
if debug_able: # if debug_able:
logger.info(f"列表行的第{start}到第{end}行是列表") # logger.info(f"列表行的第{start}到第{end}行是列表")
return split_indices(total_lines, list_indice), list_start_idx return split_indices(total_lines, list_indice), list_start_idx
...@@ -435,8 +435,8 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, ...@@ -435,8 +435,8 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[ if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[
"type"] == BlockType.Text: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 "type"] == BlockType.Text: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if debug_able: # if debug_able:
logger.info(f"连接page {page_num} 内的list") # logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行 # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = [] may_list_lines = []
lines = next_first_para.get("lines", []) lines = next_first_para.get("lines", [])
...@@ -467,8 +467,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b ...@@ -467,8 +467,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text: if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
return False return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if debug_able: # if debug_able:
logger.info(f"连接page {page_num} 内的list") # logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行 # 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = [] may_list_lines = []
next_page_first_para = next_page_paras[0][0] next_page_first_para = next_page_paras[0][0]
...@@ -680,8 +680,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang): ...@@ -680,8 +680,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']]) first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
if "Table" in first_line_text or "Figure" in first_line_text: if "Table" in first_line_text or "Figure" in first_line_text:
pass pass
if debug_able: # if debug_able:
logger.info(line_hi.std()) # logger.info(line_hi.std())
if line_hi.std() < 2: if line_hi.std() < 2:
"""行高度相同,那么判断是否居中""" """行高度相同,那么判断是否居中"""
...@@ -693,8 +693,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang): ...@@ -693,8 +693,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
and not all([x1 == layout_box[2] for x1 in all_right_x1]): and not all([x1 == layout_box[2] for x1 in all_right_x1]):
merge_para = [block["lines"][0] for block in layout_para[start:end + 1]] merge_para = [block["lines"][0] for block in layout_para[start:end + 1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']]) para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
if debug_able: # if debug_able:
logger.info(para_text) # logger.info(para_text)
layout_para[start]["lines"] = merge_para layout_para[start]["lines"] = merge_para
for i_para in range(start + 1, end + 1): for i_para in range(start + 1, end + 1):
layout_para[i_para]["lines"] = [] layout_para[i_para]["lines"] = []
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment