Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
34e5d2ff
Commit
34e5d2ff
authored
Jun 26, 2025
by
myhloli
Browse files
feat: implement Gradio app for PDF extraction and add HTML header
parent
9a9285a5
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
380 additions
and
1 deletion
+380
-1
mineru/cli/fast_api.py
mineru/cli/fast_api.py
+0
-1
mineru/cli/gradio_app.py
mineru/cli/gradio_app.py
+250
-0
mineru/resources/header.html
mineru/resources/header.html
+130
-0
No files found.
mineru/cli/fast_api.py
View file @
34e5d2ff
...
@@ -150,7 +150,6 @@ async def parse_pdf(
...
@@ -150,7 +150,6 @@ async def parse_pdf(
return
JSONResponse
(
return
JSONResponse
(
status_code
=
200
,
status_code
=
200
,
content
=
{
content
=
{
"status"
:
"success"
,
"backend"
:
backend
,
"backend"
:
backend
,
"version"
:
__version__
,
"version"
:
__version__
,
"results"
:
result_dict
"results"
:
result_dict
...
...
mineru/cli/gradio_app.py
0 → 100644
View file @
34e5d2ff
# Copyright (c) Opendatalab. All rights reserved.
import
base64
import
os
import
re
import
time
import
zipfile
from
pathlib
import
Path
import
gradio
as
gr
from
gradio_pdf
import
PDF
from
loguru
import
logger
from
mineru.cli.common
import
prepare_env
,
do_parse
,
read_fn
from
mineru.utils.hash_utils
import
str_sha256
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language, backend, url):
    """Parse one document with ``do_parse`` and report where the results were written.

    The output name is the stem of ``doc_path`` followed by ``_`` and a
    ``%y%m%d_%H%M%S`` timestamp. The parse method is ``"vlm"`` for any backend
    whose name starts with ``"vlm"``; otherwise it is ``'ocr'`` when ``is_ocr``
    is truthy and ``'auto'`` when it is not.

    :param doc_path: path of the document, read through ``read_fn``
    :param output_dir: directory receiving the results (created when missing)
    :param end_page_id: forwarded to ``do_parse`` as ``end_page_id``
    :param is_ocr: selects the ``'ocr'`` parse method for non-VLM backends
    :param formula_enable: forwarded to ``do_parse`` as ``p_formula_enable``
    :param table_enable: forwarded to ``do_parse`` as ``p_table_enable``
    :param language: forwarded to ``do_parse`` as the single entry of ``p_lang_list``
    :param backend: backend name, forwarded to ``do_parse``
    :param url: forwarded to ``do_parse`` as ``server_url``
    :return: ``(local_md_dir, file_name)`` on success; ``None`` when any step
        inside the ``try`` raises (the exception is logged, not propagated)
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        stem = Path(doc_path).stem
        stamp = time.strftime("%y%m%d_%H%M%S")
        file_name = f'{stem}_{stamp}'
        pdf_data = read_fn(doc_path)

        # A VLM backend always wins over the OCR checkbox.
        base_method = 'ocr' if is_ocr else 'auto'
        parse_method = "vlm" if backend.startswith("vlm") else base_method

        env_dirs = prepare_env(output_dir, file_name, parse_method)
        local_image_dir, local_md_dir = env_dirs
        do_parse(
            output_dir=output_dir,
            pdf_file_names=[file_name],
            pdf_bytes_list=[pdf_data],
            p_lang_list=[language],
            parse_method=parse_method,
            end_page_id=end_page_id,
            p_formula_enable=formula_enable,
            p_table_enable=table_enable,
            backend=backend,
            server_url=url,
        )
    except Exception as err:
        # Failures are only logged; the caller receives None.
        logger.exception(err)
        return None
    else:
        return local_md_dir, file_name
def compress_directory_to_zip(directory_path, output_zip_path):
    """Pack every file below a directory into a deflate-compressed ZIP archive.

    Member names inside the archive are relative to ``directory_path``.

    :param directory_path: path of the directory to compress
    :param output_zip_path: path of the ZIP file to write
    :return: ``0`` on success, ``-1`` when an exception was raised (it is logged)
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            # Walk the whole tree; only files are added, directories are implied.
            for current_dir, _subdirs, file_names in os.walk(directory_path):
                for name in file_names:
                    source = os.path.join(current_dir, name)
                    member = os.path.relpath(source, directory_path)
                    archive.write(source, member)
    except Exception as err:
        logger.exception(err)
        return -1
    else:
        return 0
def image_to_base64(image_path):
    """Return the contents of a file as base64 text.

    :param image_path: path of the file to read in binary mode
    :return: the base64 encoding of the file's bytes, decoded as UTF-8
    """
    with open(image_path, 'rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def replace_image_with_base64(markdown_text, image_dir_path):
    """Inline the local images referenced by a Markdown text as base64 data URIs.

    Every ``![alt](target)`` tag whose target resolves to a file below
    ``image_dir_path`` is rewritten so the image travels with the text. Tags
    whose target is not a local file (for example a remote URL or a missing
    file) are left exactly as they were instead of aborting the conversion.

    :param markdown_text: Markdown source to rewrite
    :param image_dir_path: directory the image targets are relative to
    :return: the Markdown text with resolvable images embedded
    """
    # Matches a Markdown image tag and captures the link target.
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'

    def replace(match):
        relative_path = match.group(1)
        full_path = os.path.join(image_dir_path, relative_path)
        if not os.path.isfile(full_path):
            # Nothing to embed: keep the original tag untouched.
            return match.group(0)
        base64_image = image_to_base64(full_path)
        # NOTE(review): the MIME type is fixed to JPEG; confirm that the
        # parser only ever writes JPEG images.
        return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'

    # Apply the replacement to every image tag.
    return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages, is_ocr, formula_enable, table_enable, language, backend, url):
    """Convert an uploaded document and collect everything the UI displays.

    The document is normalised with ``to_pdf``, parsed into ``./output`` with
    ``parse_pdf``, the result directory is zipped, and the generated Markdown
    is read back with its images inlined as base64.

    :param file_path: path of the uploaded file, or ``None`` when nothing was uploaded
    :param end_pages: number of pages to convert; ``end_pages - 1`` is passed
        to ``parse_pdf`` as the last page index
    :param is_ocr: forwarded to ``parse_pdf``
    :param formula_enable: forwarded to ``parse_pdf``
    :param table_enable: forwarded to ``parse_pdf``
    :param language: forwarded to ``parse_pdf``
    :param backend: forwarded to ``parse_pdf``
    :param url: forwarded to ``parse_pdf``
    :return: ``(md_content, txt_content, archive_zip_path, new_pdf_path)`` where
        ``md_content`` has images embedded, ``txt_content`` is the raw Markdown,
        ``archive_zip_path`` is the ZIP of the result directory (``None`` when
        zipping failed) and ``new_pdf_path`` is the ``*_layout.pdf`` path
    :raises gr.Error: when no file was supplied or parsing failed
    """
    if file_path is None:
        # to_pdf(None) returns None, which would only fail later with an
        # opaque unpacking error; report the real problem instead.
        raise gr.Error('Please upload a PDF or image before converting.')

    file_path = to_pdf(file_path)

    # parse_pdf logs its own failure and returns None, so check before unpacking.
    parse_result = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
                             formula_enable, table_enable, language, backend, url)
    if parse_result is None:
        raise gr.Error('Conversion failed, see the server log for details.')
    local_md_dir, file_name = parse_result

    # Zip the result directory next to it, named after a hash of its path.
    archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
    zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info('压缩成功')
    else:
        logger.error('压缩失败')
        # Do not hand a missing or partial archive to the download component.
        archive_zip_path = None

    md_path = os.path.join(local_md_dir, file_name + '.md')
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    md_content = replace_image_with_base64(txt_content, local_md_dir)

    # Path of the layout PDF written by the parser.
    new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')

    return md_content, txt_content, archive_zip_path, new_pdf_path
# Math delimiter pairs passed to the Markdown component; ``display`` marks
# the block-level pairs, the others are inline.
latex_delimiters = [
    {'left': opener, 'right': closer, 'display': is_display}
    for opener, closer, is_display in (
        ('$$', '$$', True),
        ('$', '$', False),
        ('\\(', '\\)', False),
        ('\\[', '\\]', True),
    )
]
# Location of the HTML banner: ``resources/header.html`` in the directory one
# level above the one containing this module.
header_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'resources', 'header.html')
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(header_path, 'r', encoding='utf-8') as file:
    header = file.read()
# Language codes grouped by script family.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
# Family-level choices offered in place of the individual codes above.
add_lang = ['latin', 'arabic', 'cyrillic', 'devanagari']

# Choices shown in the language dropdown: the standalone codes followed by the
# family-level names. The per-language lists above are not included.
all_lang = other_lang + add_lang
def safe_stem(file_path):
    """Return the stem of ``file_path`` with unsafe characters neutralised.

    Word characters (letters, digits, underscore) and dots are kept; every
    other character is replaced by an underscore.
    """
    unsafe_chars = re.compile(r'[^\w.]')
    return unsafe_chars.sub('_', Path(file_path).stem)
def to_pdf(file_path):
    """Write the bytes returned by ``read_fn`` for a file to a sibling ``.pdf`` file.

    The new file sits in the same directory as ``file_path`` and is named
    after its sanitised stem (see ``safe_stem``).

    :param file_path: path of the source file, or ``None``
    :return: path of the written ``.pdf`` file, or ``None`` when ``file_path`` is ``None``
    """
    if file_path is None:
        return None

    # presumably read_fn yields PDF bytes for image inputs as well; verify in mineru.cli.common
    pdf_bytes = read_fn(file_path)

    target_name = f'{safe_stem(file_path)}.pdf'
    parent_dir = os.path.dirname(file_path)
    tmp_file_path = os.path.join(parent_dir, target_name)

    Path(tmp_file_path).write_bytes(pdf_bytes)
    return tmp_file_path
if __name__ == '__main__':
    # When True, PDFs found in an ``examples`` folder next to this file are
    # offered as sample inputs.
    example_enable = False

    with gr.Blocks() as demo:
        gr.HTML(header)
        with gr.Row():
            # Left panel: upload, conversion options, actions and PDF preview.
            with gr.Column(variant='panel', scale=5):
                with gr.Row():
                    file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
                with gr.Row():
                    backend = gr.Dropdown(["pipeline", "vlm-transformers", "vlm-sglang-engine", "vlm-sglang-client"], label="Backend", value="pipeline")
                with gr.Row():
                    with gr.Column():
                        max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
                with gr.Row(visible=True) as ocr_options:
                    with gr.Column():
                        language = gr.Dropdown(all_lang, label='Language', value='ch')
                with gr.Row(visible=False) as client_options:
                    with gr.Column():
                        url = gr.Textbox(label='Server URL', value='http://localhost:30000', placeholder='http://localhost:30000')
                with gr.Row(visible=True) as pipeline_options:
                    is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
                    formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
                    table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
                with gr.Row():
                    change_bu = gr.Button('Convert')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
                if example_enable:
                    example_root = os.path.join(os.path.dirname(__file__), 'examples')
                    if os.path.exists(example_root):
                        with gr.Accordion('Examples:'):
                            gr.Examples(
                                examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
                                          _.endswith('pdf')],
                                inputs=file
                            )

            # Right panel: downloadable archive and the Markdown views.
            with gr.Column(variant='panel', scale=5):
                output_file = gr.File(label='convert result', interactive=False)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)

        def update_interface(backend_choice):
            """Show only the option rows that apply to the selected backend.

            Returns three updates, in the order the outputs are wired below:
            ``client_options``, ``ocr_options``, ``pipeline_options``.
            """
            if backend_choice in ["vlm-transformers", "vlm-sglang-engine"]:
                # Local VLM backends: no extra option row is shown.
                shown = (False, False, False)
            elif backend_choice in ["vlm-sglang-client"]:
                # Remote VLM client: only the server URL row is shown.
                shown = (True, False, False)
            elif backend_choice in ["pipeline"]:
                # Pipeline backend: language and OCR/formula/table rows are shown.
                shown = (False, True, True)
            else:
                # Unknown choice: leave all three rows unchanged rather than
                # returning None for outputs that expect a value.
                return gr.update(), gr.update(), gr.update()
            return tuple(gr.update(visible=flag) for flag in shown)

        # Event wiring.
        backend.change(
            fn=update_interface,
            inputs=[backend],
            outputs=[client_options, ocr_options, pipeline_options]
        )

        # Preview the upload as soon as it changes.
        file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
        change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, formula_enable, table_enable, language, backend, url],
                        outputs=[md, md_text, output_file, pdf_show])
        clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])

    demo.launch(server_name='localhost')
mineru/resources/header.html
0 → 100644
View file @
34e5d2ff
<html><head>
<!-- Font Awesome provides the icons used on the link buttons. -->
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
<style>
  /* Dark rounded background behind each link button. */
  .link-block {
    border: 1px solid transparent;
    border-radius: 24px;
    background-color: rgba(54, 54, 54, 1);
    cursor: pointer !important;
  }
  /* Slightly translucent on hover. */
  .link-block:hover {
    background-color: rgba(54, 54, 54, 0.75) !important;
    cursor: pointer !important;
  }
  /* The clickable anchor inside a link block. */
  .external-link {
    display: inline-flex;
    align-items: center;
    height: 36px;
    line-height: 36px;
    padding: 0 16px;
    cursor: pointer !important;
  }
  .external-link,
  .external-link:hover {
    cursor: pointer !important;
  }
  a {
    text-decoration: none;
  }
</style></head>

<body>
<!-- Banner: gradient background with title, tagline and link buttons. -->
<div style="display: flex; flex-direction: column; justify-content: center; align-items: center; text-align: center; background: linear-gradient(45deg, #007bff 0%, #0056b3 100%); padding: 24px; gap: 24px; border-radius: 8px;">
  <div style="display: flex; flex-direction: column; align-items: center; gap: 16px;">
    <div style="display: flex; flex-direction: column; gap: 8px">
      <h1 style="font-size: 48px; color: #fafafa; margin: 0; font-family: 'Trebuchet MS', 'Lucida Sans Unicode', 'Lucida Grande', 'Lucida Sans', Arial, sans-serif;">
        MinerU: PDF Extraction Demo
      </h1>
    </div>
  </div>

  <p style="margin: 0; line-height: 1.6rem; font-size: 16px; color: #fafafa; opacity: 0.8;">
    A one-stop, open-source, high-quality data extraction tool, supports
    PDF/webpage/e-book extraction.<br>
  </p>

  <!-- Lay the link buttons out side by side with a gap between them. -->
  <style>
    .link-block {
      display: inline-block;
    }
    .link-block + .link-block {
      margin-left: 20px;
    }
  </style>

  <div class="column has-text-centered">
    <div class="publication-links">
      <!-- Code Link. -->
      <span class="link-block">
        <a href="https://github.com/opendatalab/MinerU"
           class="external-link button is-normal is-rounded is-dark"
           style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 4px">
            <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
          </span>
          <span style="color: white">Code</span>
        </a>
      </span>

      <!-- arXiv Link. -->
      <span class="link-block">
        <a href="https://arxiv.org/abs/2409.18839"
           class="external-link button is-normal is-rounded is-dark"
           style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-file" style="color: white"></i>
          </span>
          <span style="color: white">Paper</span>
        </a>
      </span>

      <!-- Homepage Link. -->
      <span class="link-block">
        <a href="https://mineru.net/home?source=online"
           class="external-link button is-normal is-rounded is-dark"
           style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-home" style="color: white"></i>
          </span>
          <span style="color: white">Homepage</span>
        </a>
      </span>

      <!-- Client Link. -->
      <span class="link-block">
        <a href="https://mineru.net/client?source=online"
           class="external-link button is-normal is-rounded is-dark"
           style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-download" style="color: white"></i>
          </span>
          <span style="color: white">Download</span>
        </a>
      </span>
    </div>
  </div>

  <!-- New Demo Links -->
</div>
</body></html>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment