You need to sign in or sign up before continuing.
Commit ecdaa49a authored by myhloli's avatar myhloli
Browse files

refactor(magic_pdf): remove unused functions and simplify code

parent 81635062
...@@ -34,13 +34,11 @@ except ImportError: ...@@ -34,13 +34,11 @@ except ImportError:
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.para.para_split_v3 import para_split from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import \ from magic_pdf.pre_proc.construct_page_dict import \
ocr_construct_page_component_v2 ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.equations_replace import (
combine_chars_to_pymudict, remove_chars_in_text_blocks,
replace_equations_in_textblock)
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2 ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks, from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
...@@ -49,26 +47,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks, ...@@ -49,26 +47,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
from magic_pdf.pre_proc.ocr_span_list_modify import ( from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans, get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans) remove_overlaps_min_spans)
from magic_pdf.pre_proc.resolve_bbox_conflict import \
check_useful_block_horizontal_overlap
def remove_horizontal_overlap_block_which_smaller(all_bboxes):
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({'bbox': bbox[:4]})
is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
check_useful_block_horizontal_overlap(useful_blocks)
)
if is_useful_block_horz_overlap:
logger.warning(
f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
) # noqa: E501
for bbox in all_bboxes.copy():
if smaller_bbox == bbox[:4]:
all_bboxes.remove(bbox)
return is_useful_block_horz_overlap, all_bboxes
def __replace_STX_ETX(text_str: str): def __replace_STX_ETX(text_str: str):
...@@ -264,39 +242,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang ...@@ -264,39 +242,6 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
return spans return spans
def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
'blocks'
]
text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
text_blocks = replace_equations_in_textblock(
text_blocks, inline_equations, interline_equations
)
text_blocks = remove_citation_marker(text_blocks)
text_blocks = remove_chars_in_text_blocks(text_blocks)
spans = []
for v in text_blocks:
for line in v['lines']:
for span in line['spans']:
bbox = span['bbox']
if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
continue
if span.get('type') not in (
ContentType.InlineEquation,
ContentType.InterlineEquation,
):
spans.append(
{
'bbox': list(span['bbox']),
'content': __replace_STX_ETX(span['text']),
'type': ContentType.Text,
'score': 1.0,
}
)
return spans
def replace_text_span(pymu_spans, ocr_spans): def replace_text_span(pymu_spans, ocr_spans):
return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
...@@ -722,10 +667,6 @@ def parse_page_core( ...@@ -722,10 +667,6 @@ def parse_page_core(
"""根据parse_mode,构造spans,主要是文本类的字符填充""" """根据parse_mode,构造spans,主要是文本类的字符填充"""
if parse_mode == SupportedPdfParseMethod.TXT: if parse_mode == SupportedPdfParseMethod.TXT:
"""之前的公式替换方案"""
# pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
# spans = replace_text_span(pymu_spans, spans)
"""使用新版本的混合ocr方案""" """使用新版本的混合ocr方案"""
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang) spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment