Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
68c45530
Unverified
Commit
68c45530
authored
Nov 29, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 29, 2024
Browse files
Merge pull request #1140 from myhloli/dev
refactor(pdf_parse): adjust character-axis alignment algorithm
parents
086b48b7
d4345b6e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
4 deletions
+4
-4
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+4
-4
No files found.
magic_pdf/pdf_parse_union_core_v2.py
View file @
68c45530
...
@@ -117,7 +117,7 @@ def fill_char_in_spans(spans, all_chars):
...
@@ -117,7 +117,7 @@ def fill_char_in_spans(spans, all_chars):
# 使用鲁棒性更强的中心点坐标判断
# 使用鲁棒性更强的中心点坐标判断
def
calculate_char_in_span
(
char_bbox
,
span_bbox
,
char
):
def
calculate_char_in_span
(
char_bbox
,
span_bbox
,
char
,
span_height_radio
=
0.33
):
char_center_x
=
(
char_bbox
[
0
]
+
char_bbox
[
2
])
/
2
char_center_x
=
(
char_bbox
[
0
]
+
char_bbox
[
2
])
/
2
char_center_y
=
(
char_bbox
[
1
]
+
char_bbox
[
3
])
/
2
char_center_y
=
(
char_bbox
[
1
]
+
char_bbox
[
3
])
/
2
span_center_y
=
(
span_bbox
[
1
]
+
span_bbox
[
3
])
/
2
span_center_y
=
(
span_bbox
[
1
]
+
span_bbox
[
3
])
/
2
...
@@ -126,7 +126,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
...
@@ -126,7 +126,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
if
(
if
(
span_bbox
[
0
]
<
char_center_x
<
span_bbox
[
2
]
span_bbox
[
0
]
<
char_center_x
<
span_bbox
[
2
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
/
4
# 字符的中轴和span的中轴高度差不能超过1/4span高度
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
*
span_height_radio
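# The vertical offset between the char's center axis and the span's center axis must be less than span_height * span_height_radio (default 0.33; previously a fixed 1/4 of the span height)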
):
):
return
True
return
True
else
:
else
:
...
@@ -137,7 +137,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
...
@@ -137,7 +137,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
(
span_bbox
[
2
]
-
span_height
)
<
char_bbox
[
0
]
<
span_bbox
[
2
]
(
span_bbox
[
2
]
-
span_height
)
<
char_bbox
[
0
]
<
span_bbox
[
2
]
and
char_center_x
>
span_bbox
[
0
]
and
char_center_x
>
span_bbox
[
0
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
/
4
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
*
span_height_radio
):
):
return
True
return
True
elif
char
in
LINE_START_FLAG
:
elif
char
in
LINE_START_FLAG
:
...
@@ -145,7 +145,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
...
@@ -145,7 +145,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char):
span_bbox
[
0
]
<
char_bbox
[
2
]
<
(
span_bbox
[
0
]
+
span_height
)
span_bbox
[
0
]
<
char_bbox
[
2
]
<
(
span_bbox
[
0
]
+
span_height
)
and
char_center_x
<
span_bbox
[
2
]
and
char_center_x
<
span_bbox
[
2
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
/
4
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
*
span_height_radio
):
):
return
True
return
True
else
:
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment