Commit cfc78406 authored by myhloli
Browse files

fix: handle empty lines and spans in pipeline_middle_json_mkcontent.py

parent 7ae4f80d
...@@ -34,6 +34,8 @@ def make_blocks_to_markdown(paras_of_layout, ...@@ -34,6 +34,8 @@ def make_blocks_to_markdown(paras_of_layout,
title_level = get_title_level(para_block) title_level = get_title_level(para_block)
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}' para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
elif para_type == BlockType.INTERLINE_EQUATION: elif para_type == BlockType.INTERLINE_EQUATION:
if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
continue
if para_block['lines'][0]['spans'][0].get('content', ''): if para_block['lines'][0]['spans'][0].get('content', ''):
para_text = merge_para_with_text(para_block) para_text = merge_para_with_text(para_block)
else: else:
...@@ -201,6 +203,8 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx): ...@@ -201,6 +203,8 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
if title_level != 0: if title_level != 0:
para_content['text_level'] = title_level para_content['text_level'] = title_level
elif para_type == BlockType.INTERLINE_EQUATION: elif para_type == BlockType.INTERLINE_EQUATION:
if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
return None
para_content = { para_content = {
'type': 'equation', 'type': 'equation',
'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}", 'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
...@@ -263,6 +267,7 @@ def union_make(pdf_info_dict: list, ...@@ -263,6 +267,7 @@ def union_make(pdf_info_dict: list,
elif make_mode == MakeMode.CONTENT_LIST: elif make_mode == MakeMode.CONTENT_LIST:
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx) para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
if para_content:
output_content.append(para_content) output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment