"vscode:/vscode.git/clone" did not exist on "e00f6f63ca21af8e6ed01ea35ef27d608fb71d78"
Unverified commit 0281048d, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #1338 from myhloli/dev

refactor(pre_proc): improve character overlap handling in spans 
parents 58b2e78d 24dfd1a0
...@@ -35,7 +35,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table ...@@ -35,7 +35,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \ from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, remove_overlaps_chars remove_overlaps_min_spans, check_chars_is_overlap_in_span
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
...@@ -78,6 +78,8 @@ def chars_to_content(span): ...@@ -78,6 +78,8 @@ def chars_to_content(span):
if len(span['chars']) == 0: if len(span['chars']) == 0:
pass pass
# span['content'] = '' # span['content'] = ''
elif check_chars_is_overlap_in_span(span['chars']):
pass
else: else:
# 先给chars按char['bbox']的中心点的x坐标排序 # 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2) span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
...@@ -121,10 +123,6 @@ def fill_char_in_spans(spans, all_chars): ...@@ -121,10 +123,6 @@ def fill_char_in_spans(spans, all_chars):
empty_spans = [] empty_spans = []
for span in spans: for span in spans:
# 移除同一个span中重叠的char
span['chars'] = remove_overlaps_chars(span['chars'])
chars_to_content(span) chars_to_content(span)
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤 # 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5: if len(span['content']) * span['height'] < span['width'] * 0.5:
......
...@@ -33,29 +33,12 @@ def remove_overlaps_low_confidence_spans(spans): ...@@ -33,29 +33,12 @@ def remove_overlaps_low_confidence_spans(spans):
return spans, dropped_spans return spans, dropped_spans
def remove_overlaps_chars(chars): def check_chars_is_overlap_in_span(chars):
dropped_chars = [] for i in range(len(chars)):
# 删除重叠的char for j in range(i + 1, len(chars)):
for char1 in chars: if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.9:
for char2 in chars: return True
if char1 != char2: return False
# char1 或 char2 任何一个都不应该在 dropped_chars 中
if char1 in dropped_chars or char2 in dropped_chars:
continue
else:
if calculate_iou(char1['bbox'], char2['bbox']) > 0.95:
char_need_remove = char1
if (
char_need_remove is not None
and char_need_remove not in dropped_chars
):
dropped_chars.append(char_need_remove)
if len(dropped_chars) > 0:
for char_need_remove in dropped_chars:
chars.remove(char_need_remove)
return chars
def remove_overlaps_min_spans(spans): def remove_overlaps_min_spans(spans):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment