wangsen / MinerU · Commits
Unverified commit e33ec616, authored Mar 19, 2025 by Xiaomeng Zhao, committed by GitHub on Mar 19, 2025
Merge pull request #1919 from JesseChen1031/jesse
add support for more document types
Parents: ecdd162f, 2bdb5445
Showing 2 changed files with 94 additions and 45 deletions

projects/web_api/app.py  +70 -45
signatures/version1/cla.json  +24 -0
projects/web_api/app.py @ e33ec616
@@ -3,6 +3,7 @@ import os
 from base64 import b64encode
 from glob import glob
 from io import StringIO
+import tempfile
 from typing import Tuple, Union
 import uvicorn
@@ -10,11 +11,12 @@ from fastapi import FastAPI, HTTPException, UploadFile
 from fastapi.responses import JSONResponse
 from loguru import logger
+from magic_pdf.data.read_api import read_local_images, read_local_office
 import magic_pdf.model as model_config
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
 from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
-from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
 from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.operators.models import InferenceResult
@@ -24,6 +26,9 @@ model_config.__use_inside_model__ = True
 app = FastAPI()
+pdf_extensions = [".pdf"]
+office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
+image_extensions = [".png", ".jpg"]


 class MemoryDataWriter(DataWriter):
     def __init__(self):
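(Not part of the diff: a quick illustration.) The new extension lists keep the leading dot because the code below derives `file_extension` with `os.path.splitext`, which returns the suffix dot-included, so membership checks such as `file_extension in office_extensions` line up:

import os

# splitext keeps the leading dot, matching the lists above
assert os.path.splitext("slides.pptx")[1] == ".pptx"
assert os.path.splitext("scan.png")[1] in [".png", ".jpg"]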
@@ -46,8 +51,8 @@ class MemoryDataWriter(DataWriter):
 def init_writers(
-    pdf_path: str = None,
-    pdf_file: UploadFile = None,
+    file_path: str = None,
+    file: UploadFile = None,
     output_path: str = None,
     output_image_path: str = None,
 ) -> Tuple[
@@ -59,19 +64,19 @@ def init_writers(
     Initialize writers based on path type

     Args:
-        pdf_path: PDF file path (local path or S3 path)
-        pdf_file: Uploaded PDF file object
+        file_path: file path (local path or S3 path)
+        file: Uploaded file object
         output_path: Output directory path
         output_image_path: Image output directory path

     Returns:
-        Tuple[writer, image_writer, pdf_bytes]: Returns initialized writer tuple and PDF
-            file content
+        Tuple[writer, image_writer, file_bytes]: Returns initialized writer tuple and file content
     """
-    if pdf_path:
-        is_s3_path = pdf_path.startswith("s3://")
+    file_extension: str = None
+    if file_path:
+        is_s3_path = file_path.startswith("s3://")
         if is_s3_path:
-            bucket = get_bucket_name(pdf_path)
+            bucket = get_bucket_name(file_path)
             ak, sk, endpoint = get_s3_config(bucket)

             writer = S3DataWriter(
@@ -84,25 +89,29 @@ def init_writers(
             temp_reader = S3DataReader(
                 "", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
             )
-            pdf_bytes = temp_reader.read(pdf_path)
+            file_bytes = temp_reader.read(file_path)
+            file_extension = os.path.splitext(file_path)[1]
         else:
             writer = FileBasedDataWriter(output_path)
             image_writer = FileBasedDataWriter(output_image_path)
             os.makedirs(output_image_path, exist_ok=True)
-            with open(pdf_path, "rb") as f:
-                pdf_bytes = f.read()
+            with open(file_path, "rb") as f:
+                file_bytes = f.read()
+            file_extension = os.path.splitext(file_path)[1]
     else:
         # Handle the uploaded file
-        pdf_bytes = pdf_file.file.read()
+        file_bytes = file.file.read()
+        file_extension = os.path.splitext(file.filename)[1]
         writer = FileBasedDataWriter(output_path)
         image_writer = FileBasedDataWriter(output_image_path)
         os.makedirs(output_image_path, exist_ok=True)

-    return writer, image_writer, pdf_bytes
+    return writer, image_writer, file_bytes, file_extension


-def process_pdf(
-    pdf_bytes: bytes,
+def process_file(
+    file_bytes: bytes,
+    file_extension: str,
     parse_method: str,
     image_writer: Union[S3DataWriter, FileBasedDataWriter],
 ) -> Tuple[InferenceResult, PipeResult]:
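(Illustration only, not part of the commit.) With the changed signature, callers of init_writers unpack four values instead of three; the S3 object and output directories below are made-up placeholders:

writer, image_writer, file_bytes, file_extension = init_writers(
    file_path="s3://my-bucket/docs/report.docx",  # hypothetical S3 object
    output_path="output/report",
    output_image_path="output/report/images",
)
# file_extension is now ".docx" and is forwarded to process_file()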
@@ -110,14 +119,30 @@ def process_pdf(
     Process PDF file content

     Args:
-        pdf_bytes: Binary content of PDF file
+        file_bytes: Binary content of file
+        file_extension: file extension
         parse_method: Parse method ('ocr', 'txt', 'auto')
         image_writer: Image writer

     Returns:
         Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
     """
-    ds = PymuDocDataset(pdf_bytes)
+    ds = Union[PymuDocDataset, ImageDataset]
+    if file_extension in pdf_extensions:
+        ds = PymuDocDataset(file_bytes)
+    elif file_extension in office_extensions:
+        # Requires office-based parsing
+        temp_dir = tempfile.mkdtemp()
+        with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
+            f.write(file_bytes)
+        ds = read_local_office(temp_dir)[0]
+    elif file_extension in image_extensions:
+        # Requires OCR-based parsing
+        temp_dir = tempfile.mkdtemp()
+        with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
+            f.write(file_bytes)
+        ds = read_local_images(temp_dir)[0]

     infer_result: InferenceResult = None
     pipe_result: PipeResult = None
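(Illustration only, not part of the commit.) A minimal sketch of driving the renamed helper with a local Office document, assuming the names defined in this diff (process_file, FileBasedDataWriter) and a hypothetical example.docx:

import os

with open("example.docx", "rb") as f:                    # hypothetical input document
    file_bytes = f.read()
file_extension = os.path.splitext("example.docx")[1]     # ".docx" -> office branch
image_writer = FileBasedDataWriter("output/example/images")
infer_result, pipe_result = process_file(file_bytes, file_extension, "auto", image_writer)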
@@ -145,13 +170,13 @@ def encode_image(image_path: str) -> str:
 @app.post(
-    "/pdf_parse",
+    "/file_parse",
     tags=["projects"],
-    summary="Parse PDF files (supports local files and S3)",
+    summary="Parse files (supports local files and S3)",
 )
-async def pdf_parse(
-    pdf_file: UploadFile = None,
-    pdf_path: str = None,
+async def file_parse(
+    file: UploadFile = None,
+    file_path: str = None,
     parse_method: str = "auto",
     is_json_md_dump: bool = False,
     output_dir: str = "output",
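(Illustration only, not part of the commit.) Since the route is renamed from /pdf_parse to /file_parse and the upload parameter from pdf_file to file, an existing client would be updated along these lines; the host, port, and use of the requests library are assumptions:

import requests

with open("slides.pptx", "rb") as f:              # hypothetical upload
    resp = requests.post(
        "http://127.0.0.1:8888/file_parse",       # host/port depend on the deployment
        files={"file": f},                        # field name matches the new parameter
        params={"parse_method": "auto"},
    )
print(resp.status_code)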
@@ -165,10 +190,10 @@ async def pdf_parse(
     to the specified directory.

     Args:
-        pdf_file: The PDF file to be parsed. Must not be specified together with
-            `pdf_path`
-        pdf_path: The path to the PDF file to be parsed. Must not be specified together
-            with `pdf_file`
+        file: The PDF file to be parsed. Must not be specified together with
+            `file_path`
+        file_path: The path to the PDF file to be parsed. Must not be specified together
+            with `file`
         parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
             results are not satisfactory, try ocr
         is_json_md_dump: Whether to write parsed data to .json and .md files. Default
@@ -181,31 +206,31 @@ async def pdf_parse(
         return_content_list: Whether to return parsed PDF content list. Default to False
     """
     try:
-        if (pdf_file is None and pdf_path is None) or (
-            pdf_file is not None and pdf_path is not None
+        if (file is None and file_path is None) or (
+            file is not None and file_path is not None
         ):
             return JSONResponse(
-                content={"error": "Must provide either pdf_file or pdf_path"},
+                content={"error": "Must provide either file or file_path"},
                 status_code=400,
             )

         # Get PDF filename
-        pdf_name = os.path.basename(pdf_path if pdf_path else pdf_file.filename).split(
+        file_name = os.path.basename(file_path if file_path else file.filename).split(
             "."
         )[0]
-        output_path = f"{output_dir}/{pdf_name}"
+        output_path = f"{output_dir}/{file_name}"
         output_image_path = f"{output_path}/images"

         # Initialize readers/writers and get PDF content
-        writer, image_writer, pdf_bytes = init_writers(
-            pdf_path=pdf_path,
-            pdf_file=pdf_file,
+        writer, image_writer, file_bytes, file_extension = init_writers(
+            file_path=file_path,
+            file=file,
             output_path=output_path,
             output_image_path=output_image_path,
         )

         # Process PDF
-        infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)
+        infer_result, pipe_result = process_file(file_bytes, file_extension, parse_method, image_writer)

         # Use MemoryDataWriter to get results
         content_list_writer = MemoryDataWriter()
@@ -226,23 +251,23 @@ async def pdf_parse(
         # If results need to be saved
         if is_json_md_dump:
             writer.write_string(
-                f"{pdf_name}_content_list.json", content_list_writer.get_value()
+                f"{file_name}_content_list.json", content_list_writer.get_value()
             )
-            writer.write_string(f"{pdf_name}.md", md_content)
+            writer.write_string(f"{file_name}.md", md_content)
             writer.write_string(
-                f"{pdf_name}_middle.json", middle_json_writer.get_value()
+                f"{file_name}_middle.json", middle_json_writer.get_value()
             )
             writer.write_string(
-                f"{pdf_name}_model.json",
+                f"{file_name}_model.json",
                 json.dumps(model_json, indent=4, ensure_ascii=False),
             )
             # Save visualization results
-            pipe_result.draw_layout(os.path.join(output_path, f"{pdf_name}_layout.pdf"))
-            pipe_result.draw_span(os.path.join(output_path, f"{pdf_name}_spans.pdf"))
+            pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
+            pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
             pipe_result.draw_line_sort(
-                os.path.join(output_path, f"{pdf_name}_line_sort.pdf")
+                os.path.join(output_path, f"{file_name}_line_sort.pdf")
             )
-            infer_result.draw_model(os.path.join(output_path, f"{pdf_name}_model.pdf"))
+            infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))

         # Build return data
         data = {}
signatures/version1/cla.json @ e33ec616
@@ -183,6 +183,30 @@
       "created_at": "2025-02-26T09:23:25Z",
       "repoId": 765083837,
       "pullRequestNo": 1785
     },
+    {
+      "name": "rschutski",
+      "id": 179498169,
+      "comment_id": 2705150371,
+      "created_at": "2025-03-06T23:16:30Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1863
+    },
+    {
+      "name": "qbit-",
+      "id": 4794088,
+      "comment_id": 2705914730,
+      "created_at": "2025-03-07T09:09:13Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1863
+    },
+    {
+      "name": "mauryaland",
+      "id": 22381129,
+      "comment_id": 2717322316,
+      "created_at": "2025-03-12T10:03:11Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1906
+    }
   ]
 }
\ No newline at end of file