Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1cb79e7d
Unverified
Commit
1cb79e7d
authored
Mar 13, 2024
by
myhloli
Committed by
GitHub
Mar 13, 2024
Browse files
Merge pull request #5 from myhloli/dev-in-line-bbox
Dev in line bbox
parents
6f7aa890
8f264082
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
11 additions
and
4 deletions
+11
-4
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+11
-4
No files found.
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
1cb79e7d
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
from
magic_pdf.libs.boxbase
import
__is_overlaps_y_exceeds_threshold
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
from
magic_pdf.libs.boxbase
import
calculate_overlap_area_in_bbox1_area_ratio
,
get_minbox_if_overlap_by_ratio
,
\
__is_overlaps_y_exceeds_threshold
__is_overlaps_y_exceeds_threshold
...
@@ -128,16 +130,21 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
...
@@ -128,16 +130,21 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
text_line
=
text_inline_lines
[
j
]
text_line
=
text_inline_lines
[
j
]
y0
,
y1
=
text_line
[
1
]
y0
,
y1
=
text_line
[
1
]
if
(
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
)
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
if
(
span_y0
<
y0
and
span_y
>
y0
or
span_y0
<
y1
and
span_y
>
y1
or
span_y0
<
y0
and
span_y
>
y1
)
and
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
span
[
"bbox"
][
1
]
=
y0
# span["bbox"][3] = y1
#调整公式类型
# 调整公式类型
if
span
[
"type"
]
==
"displayed_equation"
:
if
span
[
"type"
]
==
"displayed_equation"
:
#最后一行是行间公式
if
j
+
1
>=
len
(
text_inline_lines
):
if
j
+
1
>=
len
(
text_inline_lines
):
span
[
"type"
]
=
"inline_equation"
span
[
"type"
]
=
"inline_equation"
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
else
:
else
:
#行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
y0_next
,
y1_next
=
text_inline_lines
[
j
+
1
][
1
]
y0_next
,
y1_next
=
text_inline_lines
[
j
+
1
][
1
]
if
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0_next
,
0
,
y1_next
)):
if
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0_next
,
0
,
y1_next
))
and
3
*
(
y1
-
y0
)
>
span_y
-
span_y0
:
span
[
"type"
]
=
"inline_equation"
span
[
"type"
]
=
"inline_equation"
span
[
"bbox"
][
1
]
=
y0
span
[
"bbox"
][
3
]
=
y1
break
break
elif
span_y
<
y0
or
span_y0
<
y0
and
span_y
>
y0
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
elif
span_y
<
y0
or
span_y0
<
y0
and
span_y
>
y0
and
not
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
(
0
,
y0
,
0
,
y1
)):
break
break
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment