Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
0281048d
Unverified
Commit
0281048d
authored
Dec 20, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 20, 2024
Browse files
Merge pull request #1338 from myhloli/dev
refactor(pre_proc): improve character overlap handling in spans
parents
58b2e78d
24dfd1a0
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
9 additions
and
28 deletions
+9
-28
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+3
-5
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+6
-23
No files found.
magic_pdf/pdf_parse_union_core_v2.py
View file @
0281048d
...
...
@@ -35,7 +35,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split_v2
from
magic_pdf.pre_proc.ocr_dict_merge
import
fill_spans_in_blocks
,
fix_block_spans_v2
,
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
\
remove_overlaps_min_spans
,
remove_overlaps_chars
remove_overlaps_min_spans
,
check_chars_is_overlap_in_span
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
...
...
@@ -78,6 +78,8 @@ def chars_to_content(span):
if
len
(
span
[
'chars'
])
==
0
:
pass
# span['content'] = ''
elif
check_chars_is_overlap_in_span
(
span
[
'chars'
]):
pass
else
:
# 先给chars按char['bbox']的中心点的x坐标排序
span
[
'chars'
]
=
sorted
(
span
[
'chars'
],
key
=
lambda
x
:
(
x
[
'bbox'
][
0
]
+
x
[
'bbox'
][
2
])
/
2
)
...
...
@@ -121,10 +123,6 @@ def fill_char_in_spans(spans, all_chars):
empty_spans
=
[]
for
span
in
spans
:
# 移除同一个span中重叠的char
span
[
'chars'
]
=
remove_overlaps_chars
(
span
[
'chars'
])
chars_to_content
(
span
)
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if
len
(
span
[
'content'
])
*
span
[
'height'
]
<
span
[
'width'
]
*
0.5
:
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
0281048d
...
...
@@ -33,29 +33,12 @@ def remove_overlaps_low_confidence_spans(spans):
return
spans
,
dropped_spans
def remove_overlaps_chars(chars):
    """Remove chars that almost completely overlap another char in the list.

    Every char is compared against every other char that has not been
    dropped yet.  When the value returned by ``calculate_iou`` for the two
    ``'bbox'`` entries is greater than 0.95, the char from the outer loop is
    dropped, so the last member of a group of overlapping chars is the one
    that survives.

    Entries are tracked by list position rather than by dict equality, so
    two chars whose dicts compare equal (exact duplicates) are still treated
    as two separate entries and one of them is removed.

    Args:
        chars: list of char dicts, each with a ``'bbox'`` key.  The list is
            modified in place.

    Returns:
        The same ``chars`` list object, with the dropped chars removed.
    """
    # NOTE(review): calculate_iou is defined outside this block; it is
    # assumed to return the intersection-over-union of two bboxes.
    dropped_indexes = set()

    for i, char1 in enumerate(chars):
        for j, char2 in enumerate(chars):
            # Never compare an entry with itself or with an already
            # dropped entry.
            if i == j or j in dropped_indexes:
                continue
            if calculate_iou(char1['bbox'], char2['bbox']) > 0.95:
                dropped_indexes.add(i)
                # char1 is dropped now; no further comparison can change that.
                break

    if dropped_indexes:
        # Slice assignment keeps the in-place contract for callers that
        # hold a reference to the original list.
        chars[:] = [
            char for index, char in enumerate(chars)
            if index not in dropped_indexes
        ]

    return chars
def check_chars_is_overlap_in_span(chars, iou_threshold=0.9):
    """Report whether any two chars in ``chars`` overlap heavily.

    Each unordered pair of chars is checked once, in list order.  The
    function returns as soon as one pair yields a ``calculate_iou`` value
    greater than ``iou_threshold``.

    Args:
        chars: list of char dicts, each with a ``'bbox'`` key.
        iou_threshold: a pair counts as overlapping when its
            ``calculate_iou`` value is strictly greater than this.
            Defaults to 0.9.

    Returns:
        True if at least one pair exceeds the threshold, otherwise False
        (including for lists with fewer than two chars).
    """
    # NOTE(review): calculate_iou is defined outside this block; it is
    # assumed to return the intersection-over-union of two bboxes.
    return any(
        calculate_iou(first['bbox'], second['bbox']) > iou_threshold
        for index, first in enumerate(chars)
        for second in chars[index + 1:]
    )
def
remove_overlaps_min_spans
(
spans
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment