Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3879bf8d
Unverified
Commit
3879bf8d
authored
Oct 28, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 28, 2024
Browse files
Merge pull request #804 from icecraft/fix/match_figure_caption
fix: add priority match rule
parents
3a166bf1
34a13a89
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
61 additions
and
8 deletions
+61
-8
magic_pdf/model/magic_model.py
magic_pdf/model/magic_model.py
+61
-8
No files found.
magic_pdf/model/magic_model.py
View file @
3879bf8d
import
enum
import
json
import
json
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.data.dataset
import
Dataset
...
@@ -18,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
...
@@ -18,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
MERGE_BOX_OVERLAP_AREA_RATIO
=
1.1
MERGE_BOX_OVERLAP_AREA_RATIO
=
1.1
class PosRelationEnum(enum.Enum):
    """Relative position used as a matching preference between two boxes.

    Passed as ``priority_pos`` to
    ``MagicModel.__tie_up_category_by_distance_v2``. In the visible code
    only ``UP`` and ``BOTTOM`` take part in the priority check: when an
    object has both a top and a bottom candidate at nearly the same
    distance, the candidate on the preferred side is chosen. ``ALL`` is
    passed for footnote matching and does not satisfy that check.
    """

    # Horizontal positions; not referenced by the visible priority check.
    LEFT = 'left'
    RIGHT = 'right'

    # Vertical positions; these are the two values the priority check tests.
    UP = 'up'
    BOTTOM = 'bottom'

    # No preferred side.
    ALL = 'all'
class
MagicModel
:
class
MagicModel
:
"""每个函数没有得到元素的时候返回空list."""
"""每个函数没有得到元素的时候返回空list."""
...
@@ -591,9 +600,23 @@ class MagicModel:
...
@@ -591,9 +600,23 @@ class MagicModel:
return
ret
,
total_subject_object_dis
return
ret
,
total_subject_object_dis
def
__tie_up_category_by_distance_v2
(
def
__tie_up_category_by_distance_v2
(
self
,
page_no
,
subject_category_id
,
object_category_id
self
,
page_no
:
int
,
subject_category_id
:
int
,
object_category_id
:
int
,
priority_pos
:
PosRelationEnum
,
):
):
"""_summary_
Args:
page_no (int): _description_
subject_category_id (int): _description_
object_category_id (int): _description_
priority_pos (PosRelationEnum): _description_
Returns:
_type_: _description_
"""
AXIS_MULPLICITY
=
0.5
AXIS_MULPLICITY
=
0.5
subjects
=
self
.
__reduct_overlap
(
subjects
=
self
.
__reduct_overlap
(
list
(
list
(
...
@@ -680,6 +703,27 @@ class MagicModel:
...
@@ -680,6 +703,27 @@ class MagicModel:
j
,
j
,
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
]
]
if
(
dis_by_directions
[
'top'
][
i
][
1
]
!=
float
(
'inf'
)
and
dis_by_directions
[
'bottom'
][
i
][
1
]
!=
float
(
'inf'
)
and
priority_pos
in
(
PosRelationEnum
.
BOTTOM
,
PosRelationEnum
.
UP
)
):
RATIO
=
3
if
(
abs
(
dis_by_directions
[
'top'
][
i
][
1
]
-
dis_by_directions
[
'bottom'
][
i
][
1
]
)
<
RATIO
*
axis_unit
):
if
priority_pos
==
PosRelationEnum
.
BOTTOM
:
sub_obj_map_h
[
dis_by_directions
[
'bottom'
][
i
][
0
]].
append
(
i
)
else
:
sub_obj_map_h
[
dis_by_directions
[
'top'
][
i
][
0
]].
append
(
i
)
continue
if
dis_by_directions
[
'left'
][
i
][
1
]
!=
float
(
'inf'
)
or
dis_by_directions
[
if
dis_by_directions
[
'left'
][
i
][
1
]
!=
float
(
'inf'
)
or
dis_by_directions
[
'right'
'right'
][
i
][
1
]
!=
float
(
'inf'
):
][
i
][
1
]
!=
float
(
'inf'
):
...
@@ -735,9 +779,12 @@ class MagicModel:
...
@@ -735,9 +779,12 @@ class MagicModel:
top_bottom_x_axis
=
top_bottom
[
2
]
-
top_bottom
[
0
]
top_bottom_x_axis
=
top_bottom
[
2
]
-
top_bottom
[
0
]
bottom_top_x_axis
=
bottom_top
[
2
]
-
bottom_top
[
0
]
bottom_top_x_axis
=
bottom_top
[
2
]
-
bottom_top
[
0
]
if
abs
(
top_bottom_x_axis
-
l_x_axis
)
+
dis_by_directions
[
'bottom'
][
i
][
1
]
>
abs
(
if
(
bottom_top_x_axis
-
l_x_axis
abs
(
top_bottom_x_axis
-
l_x_axis
)
)
+
dis_by_directions
[
'top'
][
i
][
1
]:
+
dis_by_directions
[
'bottom'
][
i
][
1
]
>
abs
(
bottom_top_x_axis
-
l_x_axis
)
+
dis_by_directions
[
'top'
][
i
][
1
]
):
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
else
:
else
:
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
...
@@ -798,9 +845,11 @@ class MagicModel:
...
@@ -798,9 +845,11 @@ class MagicModel:
return
ret
return
ret
def
get_imgs_v2
(
self
,
page_no
:
int
):
def
get_imgs_v2
(
self
,
page_no
:
int
):
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
4
)
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
4
,
PosRelationEnum
.
BOTTOM
)
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
CategoryId
.
ImageFootnote
page_no
,
3
,
CategoryId
.
ImageFootnote
,
PosRelationEnum
.
ALL
)
)
ret
=
[]
ret
=
[]
for
v
in
with_captions
:
for
v
in
with_captions
:
...
@@ -815,8 +864,12 @@ class MagicModel:
...
@@ -815,8 +864,12 @@ class MagicModel:
return
ret
return
ret
def
get_tables_v2
(
self
,
page_no
:
int
)
->
list
:
def
get_tables_v2
(
self
,
page_no
:
int
)
->
list
:
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
6
)
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
7
)
page_no
,
5
,
6
,
PosRelationEnum
.
UP
)
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
7
,
PosRelationEnum
.
ALL
)
ret
=
[]
ret
=
[]
for
v
in
with_captions
:
for
v
in
with_captions
:
record
=
{
record
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment