Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
5eb9feee
Unverified
Commit
5eb9feee
authored
Dec 19, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 19, 2024
Browse files
Merge pull request #1329 from myhloli/dev
feat(pre_proc): add function to remove overlapping characters in spans
parents
7248676d
2f4d4b0c
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
1 deletion
+31
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+6
-1
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+25
-0
No files found.
magic_pdf/pdf_parse_union_core_v2.py
View file @
5eb9feee
...
@@ -34,7 +34,8 @@ from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_
...
@@ -34,7 +34,8 @@ from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split_v2
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split_v2
from
magic_pdf.pre_proc.ocr_dict_merge
import
fill_spans_in_blocks
,
fix_block_spans_v2
,
fix_discarded_block
from
magic_pdf.pre_proc.ocr_dict_merge
import
fill_spans_in_blocks
,
fix_block_spans_v2
,
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
remove_overlaps_min_spans
from
magic_pdf.pre_proc.ocr_span_list_modify
import
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
\
remove_overlaps_min_spans
,
remove_overlaps_chars
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
os
.
environ
[
'NO_ALBUMENTATIONS_UPDATE'
]
=
'1'
# 禁止albumentations检查更新
...
@@ -120,6 +121,10 @@ def fill_char_in_spans(spans, all_chars):
...
@@ -120,6 +121,10 @@ def fill_char_in_spans(spans, all_chars):
empty_spans
=
[]
empty_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
# 移除同一个span中重叠的char
span
[
'chars'
]
=
remove_overlaps_chars
(
span
[
'chars'
])
chars_to_content
(
span
)
chars_to_content
(
span
)
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if
len
(
span
[
'content'
])
*
span
[
'height'
]
<
span
[
'width'
]
*
0.5
:
if
len
(
span
[
'content'
])
*
span
[
'height'
]
<
span
[
'width'
]
*
0.5
:
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
5eb9feee
...
@@ -33,6 +33,31 @@ def remove_overlaps_low_confidence_spans(spans):
...
@@ -33,6 +33,31 @@ def remove_overlaps_low_confidence_spans(spans):
return
spans
,
dropped_spans
return
spans
,
dropped_spans
def remove_overlaps_chars(chars):
    """Drop characters that almost completely overlap another character.

    Every character is compared against every other character in ``chars``.
    When the IoU of the two ``'bbox'`` values exceeds 0.95, the character
    that appears earlier in the list is dropped and the later one is kept.
    A character that has already been dropped is never used as a comparison
    partner.

    Characters are tracked by list position rather than by dict equality, so
    two entries whose dicts compare equal (exact duplicates) are still
    treated as distinct characters and are de-duplicated as well.

    Args:
        chars: list of character dicts; each one must provide a ``'bbox'``
            entry that ``calculate_iou`` accepts.

    Returns:
        The same list object, modified in place, without the dropped
        characters.
    """
    dropped_indexes = set()
    for idx1, char1 in enumerate(chars):
        for idx2, char2 in enumerate(chars):
            # Skip the character itself and partners that were already dropped.
            if idx1 == idx2 or idx2 in dropped_indexes:
                continue
            if calculate_iou(char1['bbox'], char2['bbox']) > 0.95:
                dropped_indexes.add(idx1)
                # One surviving overlapping partner is enough to drop char1.
                break

    if dropped_indexes:
        # Slice assignment keeps the caller's list object (in-place removal).
        chars[:] = [
            char for idx, char in enumerate(chars)
            if idx not in dropped_indexes
        ]

    return chars
def
remove_overlaps_min_spans
(
spans
):
def
remove_overlaps_min_spans
(
spans
):
dropped_spans
=
[]
dropped_spans
=
[]
# 删除重叠spans中较小的那些
# 删除重叠spans中较小的那些
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment