Commit 2f4d4b0c authored by myhloli
Browse files

feat(pre_proc): add function to remove overlapping characters in spans

- Implement remove_overlaps_chars function to detect and remove overlapping characters within spans
- Integrate remove_overlaps_chars function into the PDF parsing process
- Improve character-level processing and reduce redundancy in OCR results
parent 7248676d
...@@ -34,7 +34,8 @@ from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_ ...@@ -34,7 +34,8 @@ from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, remove_overlaps_chars
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
...@@ -120,6 +121,10 @@ def fill_char_in_spans(spans, all_chars): ...@@ -120,6 +121,10 @@ def fill_char_in_spans(spans, all_chars):
empty_spans = [] empty_spans = []
for span in spans: for span in spans:
# 移除同一个span中重叠的char
span['chars'] = remove_overlaps_chars(span['chars'])
chars_to_content(span) chars_to_content(span)
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤 # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5: if len(span['content']) * span['height'] < span['width'] * 0.5:
......
...@@ -33,6 +33,31 @@ def remove_overlaps_low_confidence_spans(spans): ...@@ -33,6 +33,31 @@ def remove_overlaps_low_confidence_spans(spans):
return spans, dropped_spans return spans, dropped_spans
def remove_overlaps_chars(chars, iou_threshold=0.95):
    """Remove characters whose bounding boxes almost coincide.

    Two chars count as overlapping when ``calculate_iou`` of their
    ``'bbox'`` values is greater than ``iou_threshold``. Within every
    overlapping pair the earlier list entry is dropped, so the last char
    of a stack of duplicates is the one that survives.

    Args:
        chars: List of char dicts, each with a ``'bbox'`` entry. The list
            is filtered in place.
        iou_threshold: IoU above which two chars are treated as the same
            glyph. Defaults to 0.95.

    Returns:
        The same ``chars`` list object with the overlapped chars removed.
    """
    kept_chars = []
    for index, char in enumerate(chars):
        bbox = char['bbox']
        # Compare by list position, not by dict equality: two chars with
        # identical fields are exactly the duplicates that must be
        # collapsed, and a value comparison would treat them as one char.
        overlapped_by_later = any(
            calculate_iou(bbox, later_char['bbox']) > iou_threshold
            for later_char in chars[index + 1:]
        )
        if not overlapped_by_later:
            kept_chars.append(char)

    # Slice-assign so the caller's list object is updated in place.
    chars[:] = kept_chars
    return chars
def remove_overlaps_min_spans(spans): def remove_overlaps_min_spans(spans):
dropped_spans = [] dropped_spans = []
# 删除重叠spans中较小的那些 # 删除重叠spans中较小的那些
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment