Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
7d27726e
Commit
7d27726e
authored
Jun 11, 2025
by
myhloli
Browse files
refactor: improve file naming logic and enhance unique filename generation
parent
83f7c26f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
5 deletions
+13
-5
projects/gradio_app/app.py
projects/gradio_app/app.py
+13
-5
No files found.
projects/gradio_app/app.py
View file @
7d27726e
...
@@ -20,7 +20,7 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_e
...
@@ -20,7 +20,7 @@ def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_e
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
os
.
makedirs
(
output_dir
,
exist_ok
=
True
)
try
:
try
:
file_name
=
f
'
{
str
(
Path
(
doc_path
).
stem
)
}
_
{
time
.
time
()
}
'
file_name
=
f
'
{
str
(
Path
(
doc_path
).
stem
)
}
_
{
time
.
strf
time
(
"%y%m%d_%H%M%S"
)
}
'
pdf_data
=
read_fn
(
doc_path
)
pdf_data
=
read_fn
(
doc_path
)
if
is_ocr
:
if
is_ocr
:
parse_method
=
'ocr'
parse_method
=
'ocr'
...
@@ -142,13 +142,21 @@ all_lang = []
...
@@ -142,13 +142,21 @@ all_lang = []
all_lang
.
extend
([
*
other_lang
,
*
add_lang
])
all_lang
.
extend
([
*
other_lang
,
*
add_lang
])
def safe_stem(file_path):
    """Return the stem of *file_path* with unsafe characters replaced.

    The stem is the final path component without its last suffix. Every
    character that is not a word character (letter, digit or underscore,
    as matched by ``\\w``) and not a dot is replaced with an underscore.
    """
    # Keep only letters, digits, underscores and dots; replace everything
    # else with an underscore.
    return re.sub(pattern=r'[^\w.]', repl='_', string=Path(file_path).stem)
def
to_pdf
(
file_path
):
def
to_pdf
(
file_path
):
if
file_path
is
None
:
if
file_path
is
None
:
return
None
return
None
pdf_bytes
=
read_fn
(
file_path
)
pdf_bytes
=
read_fn
(
file_path
)
# 将pdfbytes 写入到uuid.pdf中
#
生成唯一的文件名
#
unique_filename = f'{uuid.uuid4()}.pdf'
unique_filename
=
f
'
{
uuid
.
uuid4
(
)
}
.pdf'
unique_filename
=
f
'
{
safe_stem
(
file_path
)
}
.pdf'
# 构建完整的文件路径
# 构建完整的文件路径
tmp_file_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
file_path
),
unique_filename
)
tmp_file_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
file_path
),
unique_filename
)
...
@@ -168,7 +176,7 @@ if __name__ == '__main__':
...
@@ -168,7 +176,7 @@ if __name__ == '__main__':
with
gr
.
Row
():
with
gr
.
Row
():
file
=
gr
.
File
(
label
=
'Please upload a PDF or image'
,
file_types
=
[
'.pdf'
,
'.png'
,
'.jpeg'
,
'.jpg'
])
file
=
gr
.
File
(
label
=
'Please upload a PDF or image'
,
file_types
=
[
'.pdf'
,
'.png'
,
'.jpeg'
,
'.jpg'
])
with
gr
.
Row
(
equal_height
=
True
):
with
gr
.
Row
(
equal_height
=
True
):
with
gr
.
Column
(
scale
=
3
):
with
gr
.
Column
(
scale
=
4
):
max_pages
=
gr
.
Slider
(
1
,
20
,
10
,
step
=
1
,
label
=
'Max convert pages'
)
max_pages
=
gr
.
Slider
(
1
,
20
,
10
,
step
=
1
,
label
=
'Max convert pages'
)
with
gr
.
Column
(
scale
=
1
):
with
gr
.
Column
(
scale
=
1
):
language
=
gr
.
Dropdown
(
all_lang
,
label
=
'Language'
,
value
=
'ch'
)
language
=
gr
.
Dropdown
(
all_lang
,
label
=
'Language'
,
value
=
'ch'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment