wangsen/MinerU · commit 102fe277

add support for more document types

Authored Mar 14, 2025 by JesseChen1031
Parent: ce67ccf8
Showing 1 changed file with 59 additions and 34 deletions.
projects/web_api/app.py @ 102fe277 (+59, -34)
@@ -3,6 +3,7 @@ import os
 from base64 import b64encode
 from glob import glob
 from io import StringIO
+import tempfile
 from typing import Tuple, Union
 
 import uvicorn
@@ -10,11 +11,12 @@ from fastapi import FastAPI, HTTPException, UploadFile
 from fastapi.responses import JSONResponse
 from loguru import logger
+from magic_pdf.data.read_api import read_local_images, read_local_office
 
 import magic_pdf.model as model_config
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
 from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
-from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
 from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.operators.models import InferenceResult
@@ -24,6 +26,9 @@ model_config.__use_inside_model__ = True
 app = FastAPI()
 
+pdf_extensions = [".pdf"]
+office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
+image_extensions = [".png", ".jpg"]
 
 
 class MemoryDataWriter(DataWriter):
     def __init__(self):
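As a standalone illustration (not part of this commit), the three extension lists above are what process_file, further down in this diff, uses to pick a parser: PymuDocDataset for PDFs, read_local_office for Office formats, and read_local_images for images. A minimal sketch of that routing with a hypothetical classify helper, using only the standard library:

    import os

    # Sketch of the routing this commit introduces (illustration, not repo code).
    pdf_extensions = [".pdf"]
    office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
    image_extensions = [".png", ".jpg"]

    def classify(filename: str) -> str:
        """Return which parser family a filename would be routed to."""
        ext = os.path.splitext(filename)[1]
        if ext in pdf_extensions:
            return "pdf"          # handled by PymuDocDataset
        if ext in office_extensions:
            return "office"       # handled by read_local_office
        if ext in image_extensions:
            return "image"        # handled by read_local_images (OCR)
        return "unsupported"

    print(classify("slides.pptx"))   # -> "office"
    print(classify("scan.JPG"))      # -> "unsupported"

The membership test is case-sensitive in both the sketch and the diff, so an upper-case extension such as ".JPG" would not match any of the lists.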
@@ -46,8 +51,8 @@ class MemoryDataWriter(DataWriter):
 
 
 def init_writers(
-    pdf_path: str = None,
-    pdf_file: UploadFile = None,
+    file_path: str = None,
+    file: UploadFile = None,
     output_path: str = None,
     output_image_path: str = None,
 ) -> Tuple[
@@ -68,10 +73,11 @@ def init_writers(
         Tuple[writer, image_writer, pdf_bytes]: Returns initialized writer tuple and PDF
         file content
     """
-    if pdf_path:
-        is_s3_path = pdf_path.startswith("s3://")
+    file_extension: str = None
+    if file_path:
+        is_s3_path = file_path.startswith("s3://")
         if is_s3_path:
-            bucket = get_bucket_name(pdf_path)
+            bucket = get_bucket_name(file_path)
             ak, sk, endpoint = get_s3_config(bucket)
 
             writer = S3DataWriter(
@@ -84,25 +90,29 @@ def init_writers(
             temp_reader = S3DataReader(
                 "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
             )
-            pdf_bytes = temp_reader.read(pdf_path)
+            file_bytes = temp_reader.read(file_path)
+            file_extension = os.path.splitext(file_path)[1]
         else:
             writer = FileBasedDataWriter(output_path)
             image_writer = FileBasedDataWriter(output_image_path)
             os.makedirs(output_image_path, exist_ok=True)
-            with open(pdf_path, "rb") as f:
-                pdf_bytes = f.read()
+            with open(file_path, "rb") as f:
+                file_bytes = f.read()
+            file_extension = os.path.splitext(file_path)[1]
     else:
         # Handle an uploaded file
-        pdf_bytes = pdf_file.file.read()
+        file_bytes = file.file.read()
+        file_extension = os.path.splitext(file.filename)[1]
         writer = FileBasedDataWriter(output_path)
         image_writer = FileBasedDataWriter(output_image_path)
         os.makedirs(output_image_path, exist_ok=True)
 
-    return writer, image_writer, pdf_bytes
+    return writer, image_writer, file_bytes, file_extension
 
 
-def process_pdf(
-    pdf_bytes: bytes,
+def process_file(
+    file_bytes: bytes,
+    file_extension: str,
     parse_method: str,
     image_writer: Union[S3DataWriter, FileBasedDataWriter],
 ) -> Tuple[InferenceResult, PipeResult]:
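As a rough usage sketch (not from this commit), the new four-value return of init_writers and the widened process_file signature would be consumed roughly as below. The paths are invented, and magic_pdf from this repository must be importable for it to run:

    # Hypothetical caller; mirrors the call shape used further down in file_parse.
    writer, image_writer, file_bytes, file_extension = init_writers(
        file_path="/data/example.docx",            # made-up local path
        output_path="output/example",
        output_image_path="output/example/images",
    )
    assert file_extension == ".docx"               # splitext keeps the leading dot

    infer_result, pipe_result = process_file(
        file_bytes,
        file_extension,
        parse_method="auto",
        image_writer=image_writer,
    )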
@@ -117,7 +127,22 @@ def process_pdf(
     Returns:
         Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
     """
-    ds = PymuDocDataset(pdf_bytes)
+    ds = Union[PymuDocDataset, ImageDataset]
+    if file_extension in pdf_extensions:
+        ds = PymuDocDataset(file_bytes)
+    elif file_extension in office_extensions:
+        # Needs office-based parsing
+        temp_dir = tempfile.mkdtemp()
+        with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
+            f.write(file_bytes)
+        ds = read_local_office(temp_dir)[0]
+    elif file_extension in image_extensions:
+        # Needs OCR-based parsing
+        temp_dir = tempfile.mkdtemp()
+        with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
+            f.write(file_bytes)
+        ds = read_local_images(temp_dir)[0]
+
 
     infer_result: InferenceResult = None
     pipe_result: PipeResult = None
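One detail in the two new branches: file_extension comes from os.path.splitext and already carries its leading dot, so the temporary file name ends up with two dots. A small illustration, not repository code:

    import os
    import tempfile

    file_extension = os.path.splitext("slides.pptx")[1]    # ".pptx"
    temp_dir = tempfile.mkdtemp()
    temp_name = f"temp_file.{file_extension}"               # "temp_file..pptx"
    with open(os.path.join(temp_dir, temp_name), "wb") as f:
        f.write(b"placeholder bytes")
    print(temp_name)   # temp_file..pptx

Judging by the [0] indexing in the diff, read_local_office and read_local_images appear to pick up whatever is in the directory, and os.path.splitext("temp_file..pptx") still yields ".pptx", so the doubled dot presumably does not affect parsing.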
@@ -149,9 +174,9 @@ def encode_image(image_path: str) -> str:
     tags=["projects"],
     summary="Parse PDF files (supports local files and S3)",
 )
-async def pdf_parse(
-    pdf_file: UploadFile = None,
-    pdf_path: str = None,
+async def file_parse(
+    file: UploadFile = None,
+    file_path: str = None,
     parse_method: str = "auto",
     is_json_md_dump: bool = False,
     output_dir: str = "output",
@@ -181,31 +206,31 @@ async def pdf_parse(
         return_content_list: Whether to return parsed PDF content list. Default to False
     """
     try:
-        if (pdf_file is None and pdf_path is None) or (
-            pdf_file is not None and pdf_path is not None
+        if (file is None and file_path is None) or (
+            file is not None and file_path is not None
         ):
             return JSONResponse(
-                content={"error": "Must provide either pdf_file or pdf_path"},
+                content={"error": "Must provide either file or file_path"},
                 status_code=400,
             )
 
         # Get PDF filename
-        pdf_name = os.path.basename(pdf_path if pdf_path else pdf_file.filename).split(
+        file_name = os.path.basename(file_path if file_path else file.filename).split(
             "."
         )[0]
-        output_path = f"{output_dir}/{pdf_name}"
+        output_path = f"{output_dir}/{file_name}"
         output_image_path = f"{output_path}/images"
 
         # Initialize readers/writers and get PDF content
-        writer, image_writer, pdf_bytes = init_writers(
-            pdf_path=pdf_path,
-            pdf_file=pdf_file,
+        writer, image_writer, file_bytes, file_extension = init_writers(
+            file_path=file_path,
+            file=file,
            output_path=output_path,
             output_image_path=output_image_path,
         )
 
         # Process PDF
-        infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)
+        infer_result, pipe_result = process_file(file_bytes, file_extension, parse_method, image_writer)
 
         # Use MemoryDataWriter to get results
         content_list_writer = MemoryDataWriter()
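For reference, a client call against the renamed handler (note that the OpenAPI summary above still reads "Parse PDF files") might look like the sketch below. The host, port, and route path are assumptions, since the @app.post(...) path argument falls outside the visible hunks:

    import requests

    # Hypothetical client; the URL is assumed, field names match the new signature.
    with open("slides.pptx", "rb") as fh:
        resp = requests.post(
            "http://127.0.0.1:8888/file_parse",                # assumed endpoint
            files={"file": ("slides.pptx", fh)},
            params={"parse_method": "auto", "is_json_md_dump": "true"},
        )
    print(resp.status_code)
    print(resp.json() if resp.ok else resp.text)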
@@ -226,23 +251,23 @@ async def pdf_parse(
         # If results need to be saved
         if is_json_md_dump:
             writer.write_string(
-                f"{pdf_name}_content_list.json", content_list_writer.get_value()
+                f"{file_name}_content_list.json", content_list_writer.get_value()
             )
-            writer.write_string(f"{pdf_name}.md", md_content)
+            writer.write_string(f"{file_name}.md", md_content)
             writer.write_string(
-                f"{pdf_name}_middle.json", middle_json_writer.get_value()
+                f"{file_name}_middle.json", middle_json_writer.get_value()
             )
             writer.write_string(
-                f"{pdf_name}_model.json",
+                f"{file_name}_model.json",
                 json.dumps(model_json, indent=4, ensure_ascii=False),
             )
             # Save visualization results
-            pipe_result.draw_layout(os.path.join(output_path, f"{pdf_name}_layout.pdf"))
-            pipe_result.draw_span(os.path.join(output_path, f"{pdf_name}_spans.pdf"))
+            pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
+            pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
             pipe_result.draw_line_sort(
-                os.path.join(output_path, f"{pdf_name}_line_sort.pdf")
+                os.path.join(output_path, f"{file_name}_line_sort.pdf")
             )
-            infer_result.draw_model(os.path.join(output_path, f"{pdf_name}_model.pdf"))
+            infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))
 
         # Build return data
         data = {}