Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b03a7fae
Unverified
Commit
b03a7fae
authored
Nov 30, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 30, 2024
Browse files
Merge pull request #1153 from opendatalab/release-0.10.4
Release 0.10.4
parents
d19911f1
9726403c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
27 additions
and
16 deletions
+27
-16
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+25
-14
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+2
-2
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
b03a7fae
...
@@ -5,6 +5,7 @@ from loguru import logger
...
@@ -5,6 +5,7 @@ from loguru import logger
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.para.para_split_v3
import
ListLineTag
from
magic_pdf.para.para_split_v3
import
ListLineTag
...
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
...
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
def
merge_para_with_text
(
para_block
):
def
merge_para_with_text
(
para_block
):
block_text
=
''
for
line
in
para_block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
in
[
ContentType
.
Text
]:
block_text
+=
span
[
'content'
]
block_lang
=
detect_lang
(
block_text
)
para_text
=
''
para_text
=
''
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
if
i
>=
1
and
line
.
get
(
ListLineTag
.
IS_LIST_START_LINE
,
False
):
if
i
>=
1
and
line
.
get
(
ListLineTag
.
IS_LIST_START_LINE
,
False
):
para_text
+=
'
\n
'
para_text
+=
'
\n
'
line_text
=
''
for
span
in
line
[
'spans'
]:
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Text
:
line_text
+=
span
[
'content'
].
strip
()
for
j
,
span
in
enumerate
(
line
[
'spans'
]):
for
j
,
span
in
enumerate
(
line
[
'spans'
]):
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
...
@@ -159,12 +161,21 @@ def merge_para_with_text(para_block):
...
@@ -159,12 +161,21 @@ def merge_para_with_text(para_block):
content
=
f
"
\n
$$
\n
{
span
[
'content'
]
}
\n
$$
\n
"
content
=
f
"
\n
$$
\n
{
span
[
'content'
]
}
\n
$$
\n
"
content
=
content
.
strip
()
content
=
content
.
strip
()
if
content
!=
''
:
if
content
:
langs
=
[
'zh'
,
'ja'
,
'ko'
]
# logger.info(f'block_lang: {block_lang}, content: {content}')
if
block_lang
in
langs
:
# 中文/日语/韩文语境下,换行不需要空格分隔
if
j
==
len
(
line
[
'spans'
])
-
1
:
para_text
+=
content
else
:
para_text
+=
f
'
{
content
}
'
else
:
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if
j
==
len
(
line
[
'spans'
])
-
1
and
__is_hyphen_at_line_end
(
content
):
if
j
==
len
(
line
[
'spans'
])
-
1
and
span_type
==
ContentType
.
Text
and
__is_hyphen_at_line_end
(
content
):
para_text
+=
content
[:
-
1
]
para_text
+=
content
[:
-
1
]
else
:
# content间需要空格分隔
else
:
#
西方文本语境下
content间需要空格分隔
para_text
+=
f
'
{
content
}
'
para_text
+=
f
'
{
content
}
'
elif
span_type
==
ContentType
.
InterlineEquation
:
elif
span_type
==
ContentType
.
InterlineEquation
:
para_text
+=
content
para_text
+=
content
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
b03a7fae
...
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
...
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
all_bboxes
,
drop_reasons
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
#
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment