Commit dd4fde1f authored by zhougaofeng
Browse files

Update para_split_v2.py

parent e469df71
......@@ -140,9 +140,9 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
# 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
if len(list_indice) > 0:
if debug_able:
logger.info(f"发现了列表,列表行数:{list_indice}{list_start_idx}")
# if len(list_indice) > 0:
# if debug_able:
# logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
# TODO check一下这个特列表里缩进的行左侧是不是对齐的。
segments = []
......@@ -150,12 +150,12 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
for i in range(start, end + 1):
if i > 0:
if line_fea_encode[i] == 4:
if debug_able:
logger.info(f"列表行的第{i}行不是顶格的")
# if debug_able:
# logger.info(f"列表行的第{i}行不是顶格的")
break
else:
if debug_able:
logger.info(f"列表行的第{start}到第{end}行是列表")
# else:
# if debug_able:
# logger.info(f"列表行的第{start}到第{end}行是列表")
return split_indices(total_lines, list_indice), list_start_idx
......@@ -435,8 +435,8 @@ def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info,
if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[
"type"] == BlockType.Text: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if debug_able:
logger.info(f"连接page {page_num} 内的list")
# if debug_able:
# logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
lines = next_first_para.get("lines", [])
......@@ -467,8 +467,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
if debug_able:
logger.info(f"连接page {page_num} 内的list")
# if debug_able:
# logger.info(f"连接page {page_num} 内的list")
# 向layout_paras[i] 寻找开头具有相同缩进的连续的行
may_list_lines = []
next_page_first_para = next_page_paras[0][0]
......@@ -680,8 +680,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
if "Table" in first_line_text or "Figure" in first_line_text:
pass
if debug_able:
logger.info(line_hi.std())
# if debug_able:
# logger.info(line_hi.std())
if line_hi.std() < 2:
"""行高度相同,那么判断是否居中"""
......@@ -693,8 +693,8 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
and not all([x1 == layout_box[2] for x1 in all_right_x1]):
merge_para = [block["lines"][0] for block in layout_para[start:end + 1]]
para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']])
if debug_able:
logger.info(para_text)
# if debug_able:
# logger.info(para_text)
layout_para[start]["lines"] = merge_para
for i_para in range(start + 1, end + 1):
layout_para[i_para]["lines"] = []
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment