Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b03a7fae
"tests/vscode:/vscode.git/clone" did not exist on "c9a51491a42b088d116ef21dd114c48fab28bf9d"
Unverified
Commit
b03a7fae
authored
Nov 30, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Nov 30, 2024
Browse files
Merge pull request #1153 from opendatalab/release-0.10.4
Release 0.10.4
parents
d19911f1
9726403c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
27 additions
and
16 deletions
+27
-16
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+25
-14
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+2
-2
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
b03a7fae
...
@@ -5,6 +5,7 @@ from loguru import logger
...
@@ -5,6 +5,7 @@ from loguru import logger
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.config.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.para.para_split_v3
import
ListLineTag
from
magic_pdf.para.para_split_v3
import
ListLineTag
...
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
...
@@ -135,18 +136,19 @@ def __replace_ligatures(text: str):
def
merge_para_with_text
(
para_block
):
def
merge_para_with_text
(
para_block
):
block_text
=
''
for
line
in
para_block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
in
[
ContentType
.
Text
]:
block_text
+=
span
[
'content'
]
block_lang
=
detect_lang
(
block_text
)
para_text
=
''
para_text
=
''
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
for
i
,
line
in
enumerate
(
para_block
[
'lines'
]):
if
i
>=
1
and
line
.
get
(
ListLineTag
.
IS_LIST_START_LINE
,
False
):
if
i
>=
1
and
line
.
get
(
ListLineTag
.
IS_LIST_START_LINE
,
False
):
para_text
+=
'
\n
'
para_text
+=
'
\n
'
line_text
=
''
for
span
in
line
[
'spans'
]:
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Text
:
line_text
+=
span
[
'content'
].
strip
()
for
j
,
span
in
enumerate
(
line
[
'spans'
]):
for
j
,
span
in
enumerate
(
line
[
'spans'
]):
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
...
@@ -159,12 +161,21 @@ def merge_para_with_text(para_block):
...
@@ -159,12 +161,21 @@ def merge_para_with_text(para_block):
content
=
f
"
\n
$$
\n
{
span
[
'content'
]
}
\n
$$
\n
"
content
=
f
"
\n
$$
\n
{
span
[
'content'
]
}
\n
$$
\n
"
content
=
content
.
strip
()
content
=
content
.
strip
()
if
content
!=
''
:
if
content
:
langs
=
[
'zh'
,
'ja'
,
'ko'
]
# logger.info(f'block_lang: {block_lang}, content: {content}')
if
block_lang
in
langs
:
# 中文/日语/韩文语境下,换行不需要空格分隔
if
j
==
len
(
line
[
'spans'
])
-
1
:
para_text
+=
content
else
:
para_text
+=
f
'
{
content
}
'
else
:
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
if
span_type
in
[
ContentType
.
Text
,
ContentType
.
InlineEquation
]:
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
if
j
==
len
(
line
[
'spans'
])
-
1
and
__is_hyphen_at_line_end
(
content
):
if
j
==
len
(
line
[
'spans'
])
-
1
and
span_type
==
ContentType
.
Text
and
__is_hyphen_at_line_end
(
content
):
para_text
+=
content
[:
-
1
]
para_text
+=
content
[:
-
1
]
else
:
# content间需要空格分隔
else
:
#
西方文本语境下
content间需要空格分隔
para_text
+=
f
'
{
content
}
'
para_text
+=
f
'
{
content
}
'
elif
span_type
==
ContentType
.
InterlineEquation
:
elif
span_type
==
ContentType
.
InterlineEquation
:
para_text
+=
content
para_text
+=
content
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
b03a7fae
...
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
...
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_bboxes
=
remove_overlaps_min_blocks
(
all_bboxes
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
all_discarded_blocks
=
remove_overlaps_min_blocks
(
all_discarded_blocks
)
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
all_bboxes
,
drop_reasons
=
remove_overlap_between_bbox_for_block
(
all_bboxes
)
#
all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment