Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
99192002
"references/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "8aad0e0a64c65b4e16855764288f0bc2ad2e14c0"
Commit
99192002
authored
Jun 05, 2025
by
myhloli
Browse files
refactor: improve character span calculation and sorting logic
parent
546be00a
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
8 additions
and
4 deletions
+8
-4
mineru/utils/span_pre_proc.py
mineru/utils/span_pre_proc.py
+8
-4
No files found.
mineru/utils/span_pre_proc.py
View file @
99192002
...
@@ -344,7 +344,8 @@ def fill_char_in_spans(spans, all_chars):
...
@@ -344,7 +344,8 @@ def fill_char_in_spans(spans, all_chars):
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
LINE_START_FLAG
=
(
'('
,
'('
,
'"'
,
'“'
,
'【'
,
'{'
,
'《'
,
'<'
,
'「'
,
'『'
,
'【'
,
'['
,)
LINE_START_FLAG
=
(
'('
,
'('
,
'"'
,
'“'
,
'【'
,
'{'
,
'《'
,
'<'
,
'「'
,
'『'
,
'【'
,
'['
,)
def
calculate_char_in_span
(
char_bbox
,
span_bbox
,
char
,
span_height_radio
=
0.33
):
Span_Height_Radio
=
0.33
# 字符的中轴和span的中轴高度差不能超过1/3span高度
def
calculate_char_in_span
(
char_bbox
,
span_bbox
,
char
,
span_height_radio
=
Span_Height_Radio
):
char_center_x
=
(
char_bbox
[
0
]
+
char_bbox
[
2
])
/
2
char_center_x
=
(
char_bbox
[
0
]
+
char_bbox
[
2
])
/
2
char_center_y
=
(
char_bbox
[
1
]
+
char_bbox
[
3
])
/
2
char_center_y
=
(
char_bbox
[
1
]
+
char_bbox
[
3
])
/
2
span_center_y
=
(
span_bbox
[
1
]
+
span_bbox
[
3
])
/
2
span_center_y
=
(
span_bbox
[
1
]
+
span_bbox
[
3
])
/
2
...
@@ -353,7 +354,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
...
@@ -353,7 +354,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
if
(
if
(
span_bbox
[
0
]
<
char_center_x
<
span_bbox
[
2
]
span_bbox
[
0
]
<
char_center_x
<
span_bbox
[
2
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
span_bbox
[
1
]
<
char_center_y
<
span_bbox
[
3
]
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
*
span_height_radio
# 字符的中轴和span的中轴高度差不能超过
1/4span高度
and
abs
(
char_center_y
-
span_center_y
)
<
span_height
*
span_height_radio
# 字符的中轴和span的中轴高度差不能超过
Span_Height_Radio
):
):
return
True
return
True
else
:
else
:
...
@@ -385,7 +386,10 @@ def chars_to_content(span):
...
@@ -385,7 +386,10 @@ def chars_to_content(span):
pass
pass
else
:
else
:
# 先给chars按char['bbox']的中心点的x坐标排序
# 先给chars按char['bbox']的中心点的x坐标排序
span
[
'chars'
]
=
sorted
(
span
[
'chars'
],
key
=
lambda
x
:
(
x
[
'bbox'
][
0
]
+
x
[
'bbox'
][
2
])
/
2
)
# span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
# 给chars按char_idx排序
span
[
'chars'
]
=
sorted
(
span
[
'chars'
],
key
=
lambda
x
:
x
[
'char_idx'
])
# Calculate the width of each character
# Calculate the width of each character
char_widths
=
[
char
[
'bbox'
][
2
]
-
char
[
'bbox'
][
0
]
for
char
in
span
[
'chars'
]]
char_widths
=
[
char
[
'bbox'
][
2
]
-
char
[
'bbox'
][
0
]
for
char
in
span
[
'chars'
]]
...
@@ -393,7 +397,7 @@ def chars_to_content(span):
...
@@ -393,7 +397,7 @@ def chars_to_content(span):
median_width
=
statistics
.
median
(
char_widths
)
median_width
=
statistics
.
median
(
char_widths
)
# 通过x轴重叠比率移除一部分char
# 通过x轴重叠比率移除一部分char
span
=
remove_x_overlapping_chars
(
span
,
median_width
)
#
span = remove_x_overlapping_chars(span, median_width)
content
=
''
content
=
''
for
char
in
span
[
'chars'
]:
for
char
in
span
[
'chars'
]:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment