Commit b0fd7566 authored by myhloli
Browse files

refactor: update OCR handling and improve function parameters for clarity

parent 9bb25776
...@@ -10,6 +10,7 @@ from mineru.utils.block_pre_proc import prepare_block_bboxes, process_groups ...@@ -10,6 +10,7 @@ from mineru.utils.block_pre_proc import prepare_block_bboxes, process_groups
from mineru.utils.block_sort import sort_blocks_by_bbox from mineru.utils.block_sort import sort_blocks_by_bbox
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from mineru.utils.cut_image import cut_image_and_table from mineru.utils.cut_image import cut_image_and_table
from mineru.utils.enum_class import ContentType
from mineru.utils.llm_aided import llm_aided_title from mineru.utils.llm_aided import llm_aided_title
from mineru.utils.model_utils import clean_memory from mineru.utils.model_utils import clean_memory
from mineru.backend.pipeline.pipeline_magic_model import MagicModel from mineru.backend.pipeline.pipeline_magic_model import MagicModel
...@@ -20,7 +21,7 @@ from mineru.version import __version__ ...@@ -20,7 +21,7 @@ from mineru.version import __version__
from mineru.utils.hash_utils import str_md5 from mineru.utils.hash_utils import str_md5
def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer, page_index, ocr=False): def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer, page_index, ocr_enable=False, formula_enabled=True):
scale = image_dict["scale"] scale = image_dict["scale"]
page_pil_img = image_dict["img_pil"] page_pil_img = image_dict["img_pil"]
page_img_md5 = str_md5(image_dict["img_base64"]) page_img_md5 = str_md5(image_dict["img_base64"])
...@@ -62,7 +63,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -62,7 +63,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
block_area = (block['bbox'][2] - block['bbox'][0]) * (block['bbox'][3] - block['bbox'][1]) block_area = (block['bbox'][2] - block['bbox'][0]) * (block['bbox'][3] - block['bbox'][1])
if block_area > 0: if block_area > 0:
ratio = spans_area / block_area ratio = spans_area / block_area
if ratio > 0.25 and ocr: if ratio > 0.25 and ocr_enable:
# 移除block的group_id # 移除block的group_id
block.pop('group_id', None) block.pop('group_id', None)
# 符合文本图的条件就把块加入到文本块列表中 # 符合文本图的条件就把块加入到文本块列表中
...@@ -75,8 +76,18 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -75,8 +76,18 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
"""将所有区块的bbox整理到一起""" """将所有区块的bbox整理到一起"""
interline_equation_blocks = [] if formula_enabled:
interline_equation_blocks = []
if len(interline_equation_blocks) > 0: if len(interline_equation_blocks) > 0:
for block in interline_equation_blocks:
spans.append({
"type": ContentType.INTERLINE_EQUATION,
'score': block['score'],
"bbox": block['bbox'],
})
all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes( all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes(
img_body_blocks, img_caption_blocks, img_footnote_blocks, img_body_blocks, img_caption_blocks, img_footnote_blocks,
table_body_blocks, table_caption_blocks, table_footnote_blocks, table_body_blocks, table_caption_blocks, table_footnote_blocks,
...@@ -109,7 +120,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -109,7 +120,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
"""根据parse_mode,构造spans,主要是文本类的字符填充""" """根据parse_mode,构造spans,主要是文本类的字符填充"""
if ocr: if ocr_enable:
pass pass
else: else:
"""使用新版本的混合ocr方案.""" """使用新版本的混合ocr方案."""
...@@ -125,9 +136,9 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -125,9 +136,9 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
if len(all_bboxes) == 0: if len(all_bboxes) == 0:
return None return None
"""对imagetable截图""" """对image/table/interline_equation截图"""
for span in spans: for span in spans:
if span['type'] in ['image', 'table']: if span['type'] in [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]:
span = cut_image_and_table( span = cut_image_and_table(
span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale
) )
...@@ -150,13 +161,13 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer ...@@ -150,13 +161,13 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
return page_info return page_info
def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr=False): def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr_enable=False):
middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__} middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__}
for page_index, page_model_info in enumerate(model_list): for page_index, page_model_info in enumerate(model_list):
page = pdf_doc[page_index] page = pdf_doc[page_index]
image_dict = images_list[page_index] image_dict = images_list[page_index]
page_info = page_model_info_to_page_info( page_info = page_model_info_to_page_info(
page_model_info, image_dict, page, image_writer, page_index, ocr=ocr page_model_info, image_dict, page, image_writer, page_index, ocr_enable=ocr_enable
) )
if page_info is None: if page_info is None:
page_w, page_h = map(int, page.get_size()) page_w, page_h = map(int, page.get_size())
......
...@@ -34,7 +34,10 @@ def make_blocks_to_markdown(paras_of_layout, ...@@ -34,7 +34,10 @@ def make_blocks_to_markdown(paras_of_layout,
title_level = get_title_level(para_block) title_level = get_title_level(para_block)
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}' para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
elif para_type == BlockType.INTERLINE_EQUATION: elif para_type == BlockType.INTERLINE_EQUATION:
para_text = merge_para_with_text(para_block) if para_block['lines'][0]['spans'][0].get('content', ''):
para_text = merge_para_with_text(para_block)
else:
para_text += f"![]({img_buket_path}/{para_block['lines'][0]['spans'][0]['image_path']})"
elif para_type == BlockType.IMAGE: elif para_type == BlockType.IMAGE:
if mode == MakeMode.NLP_MD: if mode == MakeMode.NLP_MD:
continue continue
...@@ -200,9 +203,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx): ...@@ -200,9 +203,11 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
elif para_type == BlockType.INTERLINE_EQUATION: elif para_type == BlockType.INTERLINE_EQUATION:
para_content = { para_content = {
'type': 'equation', 'type': 'equation',
'text': merge_para_with_text(para_block), 'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
'text_format': 'latex',
} }
if para_block['lines'][0]['spans'][0].get('content', ''):
para_content['text'] = merge_para_with_text(para_block)
para_content['text_format'] = 'latex'
elif para_type == BlockType.IMAGE: elif para_type == BlockType.IMAGE:
para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []} para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
for block in para_block['blocks']: for block in para_block['blocks']:
......
...@@ -110,8 +110,8 @@ def do_parse( ...@@ -110,8 +110,8 @@ def do_parse(
images_list = all_image_lists[idx] images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx] pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx] _lang = lang_list[idx]
_ocr = ocr_enabled_list[idx] _ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr) middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable)
pdf_info = middle_json["pdf_info"] pdf_info = middle_json["pdf_info"]
...@@ -215,8 +215,8 @@ def do_parse( ...@@ -215,8 +215,8 @@ def do_parse(
if __name__ == "__main__": if __name__ == "__main__":
# pdf_path = "../../demo/pdfs/hello-algo-1.1.0-zh-c-word转换的span有问题.pdf" pdf_path = "../../demo/pdfs/demo2.pdf"
pdf_path = "C:/Users/zhaoxiaomeng/Downloads/数学新星问题征解第一期(2014.03).pdf" # pdf_path = "C:/Users/zhaoxiaomeng/Downloads/数学新星问题征解第一期(2014.03).pdf"
try: try:
do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,) do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], end_page_id=20,)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment