Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
002333a8
Commit
002333a8
authored
May 19, 2025
by
myhloli
Browse files
fix(ocr_mkcontent): improve image handling and footnote integration in markdown output
parent
e3f22e84
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
36 additions
and
22 deletions
+36
-22
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+36
-22
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
002333a8
...
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
elif
mode
==
'mm'
:
elif
mode
==
'mm'
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼image_body
# 检测是否存在图片脚注
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
has_image_footnote
=
any
(
block
[
'type'
]
==
BlockType
.
ImageFootnote
for
block
in
para_block
[
'blocks'
])
for
line
in
block
[
'lines'
]:
# 如果存在图片脚注,则将图片脚注拼接到图片正文后面
for
span
in
line
[
'spans'
]:
if
has_image_footnote
:
if
span
[
'type'
]
==
ContentType
.
Image
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼image_caption
if
span
.
get
(
'image_path'
,
''
):
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
f
"
\n

}
)
\n
"
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_body
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
for
line
in
block
[
'lines'
]:
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼image_footnote
for
span
in
line
[
'spans'
]:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
span
[
'type'
]
==
ContentType
.
Image
:
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
if
span
.
get
(
'image_path'
,
''
):
para_text
+=
f
""
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼image_footnote
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_text
+=
'
\n
'
+
merge_para_with_text
(
block
)
else
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼image_body
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
for
line
in
block
[
'lines'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Image
:
if
span
.
get
(
'image_path'
,
''
):
para_text
+=
f
""
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
'
\n
'
+
merge_para_with_text
(
block
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
...
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
for
span
in
line
[
'spans'
]:
for
span
in
line
[
'spans'
]:
if
span
[
'type'
]
==
ContentType
.
Table
:
if
span
[
'type'
]
==
ContentType
.
Table
:
# if processed by table model
# if processed by table model
if
span
.
get
(
'latex'
,
''
):
if
span
.
get
(
'html'
,
''
):
para_text
+=
f
"
\n\n
$
\n
{
span
[
'latex'
]
}
\n
$
\n\n
"
para_text
+=
f
"
\n
{
span
[
'html'
]
}
\n
"
elif
span
.
get
(
'html'
,
''
):
para_text
+=
f
"
\n\n
{
span
[
'html'
]
}
\n\n
"
elif
span
.
get
(
'image_path'
,
''
):
elif
span
.
get
(
'image_path'
,
''
):
para_text
+=
f
"
\n

}
)
\n
"
para_text
+=
f
""
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
para_text
+=
'
\n
'
+
merge_para_with_text
(
block
)
+
' '
if
para_text
.
strip
()
==
''
:
if
para_text
.
strip
()
==
''
:
continue
continue
else
:
else
:
page_markdown
.
append
(
para_text
.
strip
()
+
' '
)
# page_markdown.append(para_text.strip() + ' ')
page_markdown
.
append
(
para_text
.
strip
())
return
page_markdown
return
page_markdown
...
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
if
span
[
'type'
]
==
ContentType
.
Table
:
if
span
[
'type'
]
==
ContentType
.
Table
:
if
span
.
get
(
'latex'
,
''
):
if
span
.
get
(
'latex'
,
''
):
para_content
[
'table_body'
]
=
f
"
\n\n
$
\n
{
span
[
'latex'
]
}
\n
$
\n\n
"
para_content
[
'table_body'
]
=
f
"
{
span
[
'latex'
]
}
"
elif
span
.
get
(
'html'
,
''
):
elif
span
.
get
(
'html'
,
''
):
para_content
[
'table_body'
]
=
f
"
\n\n
{
span
[
'html'
]
}
\n\n
"
para_content
[
'table_body'
]
=
f
"
{
span
[
'html'
]
}
"
if
span
.
get
(
'image_path'
,
''
):
if
span
.
get
(
'image_path'
,
''
):
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
span
[
'image_path'
])
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
span
[
'image_path'
])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment