"...ci/git@developer.sourcefind.cn:OpenDAS/lietorch.git" did not exist on "266d4fd96e5c9dc7732582c384d38c7cde5401ce"
Commit 0d3ef89f authored by myhloli
Browse files

fix(pdf_parse): Move the logic for filling text content into spans before the...

fix(pdf_parse): Move the logic for filling text content into spans before the discarded_block recognition to fix the issue of empty text blocks in discarded_block.
parent e11e6b32
......@@ -682,6 +682,27 @@ def parse_page_core(
"""顺便删除大水印并保留abandon的span"""
spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
"""删除重叠spans中置信度较低的那些"""
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
"""删除重叠spans中较小的那些"""
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
if parse_mode == SupportedPdfParseMethod.TXT:
"""之前的公式替换方案"""
# pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
# spans = replace_text_span(pymu_spans, spans)
"""使用新版本的混合ocr方案"""
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
elif parse_mode == SupportedPdfParseMethod.OCR:
pass
else:
raise Exception('parse_mode must be txt or ocr')
"""先处理不需要排版的discarded_blocks"""
discarded_block_with_spans, spans = fill_spans_in_blocks(
all_discarded_blocks, spans, 0.4
......@@ -706,26 +727,6 @@ def parse_page_core(
drop_reason,
)
"""删除重叠spans中置信度较低的那些"""
spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
"""删除重叠spans中较小的那些"""
spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
"""根据parse_mode,构造spans,主要是文本类的字符填充"""
if parse_mode == SupportedPdfParseMethod.TXT:
"""之前的公式替换方案"""
# pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
# spans = replace_text_span(pymu_spans, spans)
"""ocr 中文本类的 span 用 pymu spans 替换!"""
spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
elif parse_mode == SupportedPdfParseMethod.OCR:
pass
else:
raise Exception('parse_mode must be txt or ocr')
"""对image和table截图"""
spans = ocr_cut_image_and_table(
spans, page_doc, page_id, pdf_bytes_md5, imageWriter
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment