Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
02898cdd
Commit
02898cdd
authored
Jun 11, 2025
by
myhloli
Browse files
refactor: simplify file reading function and improve input validation
parent
7eed5ee9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
26 deletions
+12
-26
mineru/cli/common.py
mineru/cli/common.py
+3
-1
projects/gradio_app/app.py
projects/gradio_app/app.py
+9
-25
No files found.
mineru/cli/common.py
View file @
02898cdd
...
...
@@ -23,7 +23,9 @@ pdf_suffixes = [".pdf"]
image_suffixes
=
[
".png"
,
".jpeg"
,
".jpg"
]
def
read_fn
(
path
:
Path
):
def
read_fn
(
path
):
if
not
isinstance
(
path
,
Path
):
path
=
Path
(
path
)
with
open
(
str
(
path
),
"rb"
)
as
input_file
:
file_bytes
=
input_file
.
read
()
if
path
.
suffix
in
image_suffixes
:
...
...
projects/gradio_app/app.py
View file @
02898cdd
...
...
@@ -12,16 +12,10 @@ import gradio as gr
from
gradio_pdf
import
PDF
from
loguru
import
logger
from
mineru.cli.common
import
prepare_env
,
do_parse
from
mineru.data.data_reader_writer
import
FileBasedDataReader
from
mineru.cli.common
import
prepare_env
,
do_parse
,
read_fn
from
mineru.utils.hash_utils
import
str_sha256
def
read_fn
(
path
):
disk_rw
=
FileBasedDataReader
(
os
.
path
.
dirname
(
path
))
return
disk_rw
.
read
(
os
.
path
.
basename
(
path
))
def
parse_pdf
(
doc_path
,
output_dir
,
end_page_id
,
is_ocr
,
formula_enable
,
table_enable
,
language
):
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
...
...
@@ -120,19 +114,6 @@ latex_delimiters = [
]
def
init_model
():
try
:
pass
return
0
except
Exception
as
e
:
logger
.
exception
(
e
)
return
-
1
model_init
=
init_model
()
logger
.
info
(
f
'model_init:
{
model_init
}
'
)
with
open
(
'header.html'
,
'r'
)
as
file
:
header
=
file
.
read
()
...
...
@@ -162,6 +143,8 @@ all_lang.extend([*other_lang, *add_lang])
def
to_pdf
(
file_path
):
if
file_path
is
None
:
return
None
pdf_bytes
=
read_fn
(
file_path
)
# 将pdfbytes 写入到uuid.pdf中
# 生成唯一的文件名
...
...
@@ -182,14 +165,15 @@ if __name__ == '__main__':
gr
.
HTML
(
header
)
with
gr
.
Row
():
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
file
=
gr
.
File
(
label
=
'Please upload a PDF or image'
,
file_types
=
[
'.pdf'
,
'.png'
,
'.jpeg'
,
'.jpg'
])
max_pages
=
gr
.
Slider
(
1
,
20
,
10
,
step
=
1
,
label
=
'Max convert pages'
)
with
gr
.
Row
():
with
gr
.
Column
():
is_ocr
=
gr
.
Checkbox
(
label
=
'Force enable OCR'
,
value
=
False
)
with
gr
.
Column
():
file
=
gr
.
File
(
label
=
'Please upload a PDF or image'
,
file_types
=
[
'.pdf'
,
'.png'
,
'.jpeg'
,
'.jpg'
])
with
gr
.
Row
(
equal_height
=
True
):
with
gr
.
Column
(
scale
=
3
):
max_pages
=
gr
.
Slider
(
1
,
20
,
10
,
step
=
1
,
label
=
'Max convert pages'
)
with
gr
.
Column
(
scale
=
1
):
language
=
gr
.
Dropdown
(
all_lang
,
label
=
'Language'
,
value
=
'ch'
)
with
gr
.
Row
():
is_ocr
=
gr
.
Checkbox
(
label
=
'Force enable OCR'
,
value
=
False
)
formula_enable
=
gr
.
Checkbox
(
label
=
'Enable formula recognition'
,
value
=
True
)
table_enable
=
gr
.
Checkbox
(
label
=
'Enable table recognition(test)'
,
value
=
True
)
with
gr
.
Row
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment