Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
e36627be
Unverified
Commit
e36627be
authored
Oct 23, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 23, 2024
Browse files
Merge pull request #777 from myhloli/add-doclayout-yolo
feat: add support for non-PDF file conversion to PDF
parents
d1c0546a
4834baf4
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
28 additions
and
5 deletions
+28
-5
projects/gradio_app/app.py
projects/gradio_app/app.py
+28
-5
No files found.
projects/gradio_app/app.py
View file @
e36627be
...
...
@@ -3,10 +3,12 @@
import
base64
import
os
import
time
import
uuid
import
zipfile
from
pathlib
import
Path
import
re
import
pymupdf
from
loguru
import
logger
from
magic_pdf.libs.hash_utils
import
compute_sha256
...
...
@@ -164,12 +166,32 @@ all_lang = [""]
all_lang
.
extend
([
*
other_lang
,
*
latin_lang
,
*
arabic_lang
,
*
cyrillic_lang
,
*
devanagari_lang
])
def to_pdf(file_path):
    """Return the path of a PDF version of the document at ``file_path``.

    The file is opened with ``pymupdf``. If the opened document reports
    ``is_pdf``, ``file_path`` is returned unchanged. Otherwise the document
    is converted with ``convert_to_pdf()`` and the resulting bytes are
    written to a new file, named ``<uuid4>.pdf``, in the same directory as
    ``file_path``; the path of that new file is returned.
    """
    with pymupdf.open(file_path) as document:
        # Nothing to convert: hand the original path straight back.
        if document.is_pdf:
            return file_path

        converted_bytes = document.convert_to_pdf()

        # Use a random UUID as the file name so that converted files
        # placed in the same directory do not collide with each other.
        target_dir = os.path.dirname(file_path)
        target_name = f"{uuid.uuid4()}.pdf"
        tmp_file_path = os.path.join(target_dir, target_name)

        # Persist the converted PDF bytes next to the source file.
        with open(tmp_file_path, 'wb') as out_file:
            out_file.write(converted_bytes)

        return tmp_file_path
if
__name__
==
"__main__"
:
with
gr
.
Blocks
()
as
demo
:
gr
.
HTML
(
header
)
with
gr
.
Row
():
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
pdf_show
=
gr
.
Markdown
(
)
file
=
gr
.
File
(
label
=
"Please upload a PDF or image"
,
file_types
=
[
".pdf"
,
".png"
,
".jpeg"
,
"jpg"
]
)
max_pages
=
gr
.
Slider
(
1
,
10
,
5
,
step
=
1
,
label
=
"Max convert pages"
)
with
gr
.
Row
():
layout_mode
=
gr
.
Dropdown
([
"layoutlmv3"
,
"doclayout_yolo"
],
label
=
"Layout model"
,
value
=
"layoutlmv3"
)
...
...
@@ -180,14 +202,14 @@ if __name__ == "__main__":
table_enable
=
gr
.
Checkbox
(
label
=
"Enable table recognition(test)"
,
value
=
False
)
with
gr
.
Row
():
change_bu
=
gr
.
Button
(
"Convert"
)
clear_bu
=
gr
.
ClearButton
(
[
pdf_show
],
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"P
lease upload pdf
"
,
interactive
=
True
,
height
=
800
)
clear_bu
=
gr
.
ClearButton
(
value
=
"Clear"
)
pdf_show
=
PDF
(
label
=
"P
DF preview
"
,
interactive
=
True
,
height
=
800
)
with
gr
.
Accordion
(
"Examples:"
):
example_root
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"examples"
)
gr
.
Examples
(
examples
=
[
os
.
path
.
join
(
example_root
,
_
)
for
_
in
os
.
listdir
(
example_root
)
if
_
.
endswith
(
"pdf"
)],
inputs
=
pdf_show
,
inputs
=
pdf_show
)
with
gr
.
Column
(
variant
=
'panel'
,
scale
=
5
):
...
...
@@ -198,8 +220,9 @@ if __name__ == "__main__":
latex_delimiters
=
latex_delimiters
,
line_breaks
=
True
)
with
gr
.
Tab
(
"Markdown text"
):
md_text
=
gr
.
TextArea
(
lines
=
45
,
show_copy_button
=
True
)
file
.
upload
(
fn
=
to_pdf
,
inputs
=
file
,
outputs
=
pdf_show
)
change_bu
.
click
(
fn
=
to_markdown
,
inputs
=
[
pdf_show
,
max_pages
,
is_ocr
,
layout_mode
,
formula_enable
,
table_enable
,
language
],
outputs
=
[
md
,
md_text
,
output_file
,
pdf_show
])
clear_bu
.
add
([
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
])
clear_bu
.
add
([
file
,
md
,
pdf_show
,
md_text
,
output_file
,
is_ocr
,
table_enable
,
language
])
demo
.
launch
(
server_name
=
"0.0.0.0"
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment