wangsen / MinerU · Commits

Commit 440fd0c7, authored Dec 12, 2024 by icecraft
Parent: 6750b53d

    fix: projects

Showing 12 changed files with 214 additions and 283 deletions
demo/demo.py                                            +0   -27
demo/magic_pdf_parse_main.py                            +0   -147
magic_pdf/pipe/operators.py                             +55  -13
next_docs/en/user_guide/quick_start/convert_doc.rst     +10  -7
next_docs/en/user_guide/quick_start/convert_docx.rst    +10  -5
next_docs/en/user_guide/quick_start/convert_image.rst   +8   -2
next_docs/en/user_guide/quick_start/convert_pdf.rst     +10  -4
next_docs/en/user_guide/quick_start/convert_ppt.rst     +12  -6
next_docs/en/user_guide/quick_start/convert_pptx.rst    +11  -5
next_docs/en/user_guide/usage/api.rst                   +42  -22
projects/web_api/app.py                                 +33  -27
projects/web_demo/web_demo/api/analysis/pdf_ext.py      +23  -18
demo/demo.py  (deleted, 100644 → 0)

```python
import os

from loguru import logger

from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pipe.UNIPipe import UNIPipe

try:
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    demo_name = 'demo1'
    pdf_path = os.path.join(current_script_dir, f'{demo_name}.pdf')
    pdf_bytes = open(pdf_path, 'rb').read()
    jso_useful_key = {'_pdf_type': '', 'model_list': []}
    local_image_dir = os.path.join(current_script_dir, 'images')
    image_dir = str(os.path.basename(local_image_dir))
    image_writer = FileBasedDataWriter(local_image_dir)

    pipe = UNIPipe(PymuDocDataset(pdf_bytes), jso_useful_key, image_writer)
    pipe.pipe_classify()
    pipe.pipe_analyze()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode='none')

    with open(f'{demo_name}.md', 'w', encoding='utf-8') as f:
        f.write(md_content)
except Exception as e:
    logger.exception(e)
```
demo/magic_pdf_parse_main.py  (deleted, 100644 → 0)

```python
import copy
import json
import os

from loguru import logger

from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe

# todo: device type selection (?)


def json_md_dump(
    pipe,
    md_writer,
    pdf_name,
    content_list,
    md_content,
    orig_model_list,
):
    # Write model results to model.json
    md_writer.write_string(
        f'{pdf_name}_model.json',
        json.dumps(orig_model_list, ensure_ascii=False, indent=4)
    )
    # Write intermediate results to middle.json
    md_writer.write_string(
        f'{pdf_name}_middle.json',
        json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
    )
    # Write text results to content_list.json
    md_writer.write_string(
        f'{pdf_name}_content_list.json',
        json.dumps(content_list, ensure_ascii=False, indent=4)
    )
    # Write the markdown result to the .md file
    md_writer.write_string(
        f'{pdf_name}.md',
        md_content,
    )


# Visualization
def draw_visualization_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name):
    # Draw layout boxes, annotated with reading order
    draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    # Draw span boxes
    draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)


def pdf_parse_main(
    pdf_path: str,
    parse_method: str = 'auto',
    model_json_path: str = None,
    is_json_md_dump: bool = True,
    is_draw_visualization_bbox: bool = True,
    output_dir: str = None
):
    """Convert a pdf to json and md, writing the .md and .json files to the pdf's directory.

    :param pdf_path: path to the .pdf file, relative or absolute
    :param parse_method: parse method, one of auto, ocr, txt; defaults to auto. If results are poor, try ocr
    :param model_json_path: path to existing model data; if empty, the built-in model is used. The pdf and model_json must correspond
    :param is_json_md_dump: whether to write the parsed data to .json and .md files, defaults to True. Data from different stages goes to three separate .json files, and the markdown content to a .md file
    :param is_draw_visualization_bbox: whether to draw visualization bounding boxes, defaults to True. Produces layout-box and span-box images
    :param output_dir: output directory; a folder named after the pdf file is created there to hold all results
    """
    try:
        pdf_name = os.path.basename(pdf_path).split('.')[0]
        pdf_path_parent = os.path.dirname(pdf_path)

        if output_dir:
            output_path = os.path.join(output_dir, pdf_name)
        else:
            output_path = os.path.join(pdf_path_parent, pdf_name)

        output_image_path = os.path.join(output_path, 'images')

        # Parent of the images directory, so that images are referenced by
        # relative path in the .md and content_list.json files
        image_path_parent = os.path.basename(output_image_path)

        pdf_bytes = open(pdf_path, 'rb').read()  # read the pdf file's binary data

        orig_model_list = []

        if model_json_path:
            # Load the raw json data (a list) of a pdf already parsed by the model
            model_json = json.loads(open(model_json_path, 'r', encoding='utf-8').read())
            orig_model_list = copy.deepcopy(model_json)
        else:
            model_json = []

        # Run the parse steps
        image_writer, md_writer = FileBasedDataWriter(output_image_path), FileBasedDataWriter(output_path)

        # Choose the parse method
        if parse_method == 'auto':
            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
            pipe = UNIPipe(PymuDocDataset(pdf_bytes), jso_useful_key, image_writer)
        elif parse_method == 'txt':
            pipe = TXTPipe(PymuDocDataset(pdf_bytes), model_json, image_writer)
        elif parse_method == 'ocr':
            pipe = OCRPipe(PymuDocDataset(pdf_bytes), model_json, image_writer)
        else:
            logger.error('unknown parse method, only auto, ocr, txt allowed')
            exit(1)

        # Classify
        pipe.pipe_classify()

        # If no model data was passed in, parse with the built-in model
        if len(model_json) == 0:
            pipe.pipe_analyze()  # analyze
            orig_model_list = copy.deepcopy(pipe.model_list)

        # Parse
        pipe.pipe_parse()

        # Save results in text and md format
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none')
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none')

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content, orig_model_list)

        if is_draw_visualization_bbox:
            draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name)

    except Exception as e:
        logger.exception(e)


# Test
if __name__ == '__main__':
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    demo_names = ['demo1', 'demo2', 'small_ocr']
    for name in demo_names:
        file_path = os.path.join(current_script_dir, f'{name}.pdf')
        pdf_parse_main(file_path)
```
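Both deleted demos drove the old UNIPipe-based flow (pipe_classify / pipe_analyze / pipe_parse). For orientation, here is a hedged sketch of the same demo under the dataset-based API that the updated docs below use; the output paths are illustrative:

```python
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

# Illustrative paths; the deleted demos read demo1.pdf next to the script.
pdf_bytes = open('demo1.pdf', 'rb').read()
local_image_dir, local_md_dir = 'output/images', 'output'
os.makedirs(local_image_dir, exist_ok=True)
image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)

# One chained call replaces pipe_classify/pipe_analyze/pipe_parse/pipe_mk_markdown.
ds = PymuDocDataset(pdf_bytes)
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
    md_writer, 'demo1.md', os.path.basename(local_image_dir)
)
```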
magic_pdf/pipe/operators.py

```diff
+import copy
 import json
 import os
 from typing import Callable
-import copy

 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import DataWriter
@@ -23,6 +23,26 @@ class PipeResult:
         self._pipe_res = pipe_res
         self._dataset = dataset

+    def get_markdown(self,
+                     img_dir_or_bucket_prefix: str,
+                     drop_mode=DropMode.WHOLE_PDF,
+                     md_make_mode=MakeMode.MM_MD) -> str:
+        """Get markdown content.
+
+        Args:
+            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
+
+        Returns:
+            str: return markdown content
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode,
+                                img_dir_or_bucket_prefix)
+        return md_content
+
     def dump_md(
         self,
         writer: DataWriter,
@@ -40,14 +60,40 @@ class PipeResult:
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
        """
-        pdf_info_list = self._pipe_res['pdf_info']
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode,
-                                img_dir_or_bucket_prefix)
+        md_content = self.get_markdown(img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
         writer.write_string(file_path, md_content)

+    def get_content_list(self,
+                         image_dir_or_bucket_prefix: str,
+                         drop_mode=DropMode.NONE,
+                         md_make_mode=MakeMode.STANDARD_FORMAT) -> str:
+        """Get Content List.
+
+        Args:
+            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
+            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
+
+        Returns:
+            str: content list content
+        """
+        pdf_info_list = self._pipe_res['pdf_info']
+        content_list = union_make(pdf_info_list, md_make_mode, drop_mode,
+                                  image_dir_or_bucket_prefix)
+        return content_list
+
     def dump_content_list(
-        self, writer: DataWriter, file_path: str, image_dir_or_bucket_prefix: str
+        self,
+        writer: DataWriter,
+        file_path: str,
+        image_dir_or_bucket_prefix: str,
+        drop_mode=DropMode.NONE,
+        md_make_mode=MakeMode.STANDARD_FORMAT,
     ):
         """Dump Content List.
@@ -55,14 +101,10 @@ class PipeResult:
            writer (DataWriter): File writer handle
            file_path (str): The file location of content list
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
+           drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
+           md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
        """
-        pdf_info_list = self._pipe_res['pdf_info']
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT,
-                                  DropMode.NONE, image_dir_or_bucket_prefix)
+        content_list = self.get_content_list(image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode)
         writer.write_string(
             file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
         )
@@ -123,7 +165,7 @@ class PipeResult:
         Returns:
             str: compress the pipeline result and return
         """
-        return JsonCompressor.compress_json(self.pdf_mid_data)
+        return JsonCompressor.compress_json(self._pipe_res)

     def apply(self, proc: Callable, *args, **kwargs):
         """Apply callable method which.
```
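The net effect of this change: dump_md and dump_content_list are now thin wrappers over the new getters, so callers can keep results in memory instead of always writing files. A minimal sketch, assuming `pipe_result` was produced as in the quick-start snippets below with figures in a local 'images' directory:

```python
# Assumes: pipe_result = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)
md_content = pipe_result.get_markdown('images')        # str, nothing written to disk
content_list = pipe_result.get_content_list('images')  # structured content blocks

# Inspect in memory; dump_md()/dump_content_list() remain available for files.
print(md_content[:200])
print(len(content_list))
```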
next_docs/en/user_guide/quick_start/convert_doc.rst

```diff
@@ -48,9 +48,12 @@ API
 input_file_name = input_file.split(".")[0]
 ds = read_local_office(input_file)[0]

+# ocr mode
 ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
+
+# txt mode
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    md_writer, f"{input_file_name}.md", image_dir
+)
```
next_docs/en/user_guide/quick_start/convert_docx.rst

```diff
@@ -47,7 +47,12 @@ API
 input_file_name = input_file.split(".")[0]
 ds = read_local_office(input_file)[0]

+# ocr mode
 ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
+
+# txt mode
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    md_writer, f"{input_file_name}.md", image_dir
+)
```
next_docs/en/user_guide/quick_start/convert_image.rst

```diff
@@ -41,6 +41,12 @@ API
 input_file_name = input_file.split(".")[0]
 ds = read_local_images(input_file)[0]

+# ocr mode
 ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
+
+# txt mode
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    md_writer, f"{input_file_name}.md", image_dir
+)
```
next_docs/en/user_guide/quick_start/convert_pdf.rst

```diff
@@ -44,6 +44,12 @@ API
 ## Create Dataset Instance
 ds = PymuDocDataset(pdf_bytes)

-ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+# ocr mode
+ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+    md_writer, f"{name_without_suff}.md", image_dir
+)
+
+# txt mode
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    md_writer, f"{name_without_suff}.md", image_dir
+)
```
next_docs/en/user_guide/quick_start/convert_ppt.rst

```diff
@@ -47,6 +47,12 @@ API
 input_file_name = input_file.split(".")[0]
 ds = read_local_office(input_file)[0]

+# ocr mode
 ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
+
+# txt mode
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    md_writer, f"{input_file_name}.md", image_dir
+)
```
next_docs/en/user_guide/quick_start/convert_pptx.rst

```diff
@@ -50,6 +50,12 @@ API
 input_file_name = input_file.split(".")[0]
 ds = read_local_office(input_file)[0]

+# ocr mode
 ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
+
+# txt mode
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
+    md_writer, f"{input_file_name}.md", image_dir
+)
```
next_docs/en/user_guide/usage/api.rst

```diff
@@ -16,6 +16,7 @@ Local File Example
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.config.enums import SupportedPdfParseMethod

 # args
 pdf_file_name = "abc.pdf"  # replace with the real pdf path
@@ -41,14 +42,21 @@ Local File Example
 ds = PymuDocDataset(pdf_bytes)

 ## inference
-infer_result = ds.apply(doc_analyze, ocr=True)
+if ds.classify() == SupportedPdfParseMethod.OCR:
+    infer_result = ds.apply(doc_analyze, ocr=True)

-### draw model result on each page
-infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
+    ## pipeline
+    pipe_result = infer_result.pipe_ocr_mode(image_writer)

-## pipeline
-pipe_result = infer_result.pipe_ocr_mode(image_writer)
+else:
+    infer_result = ds.apply(doc_analyze, ocr=False)
+
+    ## pipeline
+    pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+### draw model result on each page
+infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

 ### draw layout result on each page
 pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
@@ -58,6 +66,9 @@ Local File Example
 ### dump markdown
 pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

+### dump content list
+pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
+
 S3 File Example
 ^^^^^^^^^^^^^^^^
@@ -97,23 +108,32 @@ S3 File Example
 ds = PymuDocDataset(pdf_bytes)

 ## inference
-infer_result = ds.apply(doc_analyze, ocr=True)
+if ds.classify() == SupportedPdfParseMethod.OCR:
+    infer_result = ds.apply(doc_analyze, ocr=True)

-### draw model result on each page
-infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local
+    ## pipeline
+    pipe_result = infer_result.pipe_ocr_mode(image_writer)

-## pipeline
-pipe_result = infer_result.pipe_ocr_mode(image_writer)
+else:
+    infer_result = ds.apply(doc_analyze, ocr=False)
+
+    ## pipeline
+    pipe_result = infer_result.pipe_txt_mode(image_writer)
+
+### draw model result on each page
+infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

 ### draw layout result on each page
-pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local
+pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

 ### draw spans result on each page
-pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local
+pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))

 ### dump markdown
-pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3
+pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
+### dump content list
+pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)

 MS-Office
@@ -144,7 +164,7 @@ MS-Office
 input_file_name = input_file.split(".")[0]
 ds = read_local_office(input_file)[0]

-ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
@@ -182,7 +202,7 @@ Single Image File
 input_file_name = input_file.split(".")[0]
 ds = read_local_images(input_file)[0]

-ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
     md_writer, f"{input_file_name}.md", image_dir
 )
@@ -217,7 +237,7 @@ Directory That Contains Images
 count = 0
 for ds in dss:
-    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+    ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
         md_writer, f"{count}.md", image_dir
     )
     count += 1
```
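All of these doc snippets converge on one pattern: classify the PDF first, then pick the OCR or text pipeline. A hedged helper sketch of that pattern (the wrapper function name is ours, not part of the library):

```python
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze


def parse_auto(pdf_bytes: bytes, image_writer):
    """Hypothetical wrapper: classify, then run the matching pipeline."""
    ds = PymuDocDataset(pdf_bytes)
    if ds.classify() == SupportedPdfParseMethod.OCR:
        # Scanned or image-heavy PDFs go through the OCR pipeline.
        return ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)
    # Text-native PDFs keep their embedded text layer.
    return ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer)
```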
projects/web_api/app.py

```diff
@@ -9,10 +9,11 @@ from fastapi.responses import JSONResponse
 from loguru import logger

 import magic_pdf.model as model_config
+from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
-from magic_pdf.pipe.OCRPipe import OCRPipe
-from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.model.operators import InferenceResult

 model_config.__use_inside_model__ = True
@@ -20,14 +21,15 @@ app = FastAPI()

 def json_md_dump(
-    pipe,
+    model_json,
+    middle_json,
     md_writer,
     pdf_name,
     content_list,
     md_content,
 ):
     # Write model results to model.json
-    orig_model_list = copy.deepcopy(pipe.model_list)
+    orig_model_list = copy.deepcopy(model_json)
     md_writer.write_string(
         f'{pdf_name}_model.json',
         json.dumps(orig_model_list, ensure_ascii=False, indent=4),
@@ -36,7 +38,7 @@ def json_md_dump(
     # Write intermediate results to middle.json
     md_writer.write_string(
         f'{pdf_name}_middle.json',
-        json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
+        json.dumps(middle_json, ensure_ascii=False, indent=4),
     )

     # Write text content results to content_list.json
@@ -100,45 +102,49 @@ async def pdf_parse_main(
         output_image_path
     ), FileBasedDataWriter(output_path)

+    ds = PymuDocDataset(pdf_bytes)
+
     # Choose parsing method
     if parse_method == 'auto':
-        jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
-    elif parse_method == 'txt':
-        pipe = TXTPipe(pdf_bytes, model_json, image_writer)
-    elif parse_method == 'ocr':
-        pipe = OCRPipe(pdf_bytes, model_json, image_writer)
-    else:
+        if ds.classify() == SupportedPdfParseMethod.OCR:
+            parse_method = 'ocr'
+        else:
+            parse_method = 'txt'
+
+    if parse_method not in ['txt', 'ocr']:
         logger.error('Unknown parse method, only auto, ocr, txt allowed')
         return JSONResponse(
             content={'error': 'Invalid parse method'}, status_code=400
         )

-    # Execute classification
-    pipe.pipe_classify()
-
-    # If no model data is provided, use built-in model for parsing
-    if not model_json:
-        if model_config.__use_inside_model__:
-            pipe.pipe_analyze()  # Parse
-        else:
-            logger.error('Need model list input')
-            return JSONResponse(
-                content={'error': 'Model list input required'}, status_code=400
-            )
-
-    # Execute parsing
-    pipe.pipe_parse()
+    if len(model_json) == 0:
+        if parse_method == 'ocr':
+            infer_result = ds.apply(doc_analyze, ocr=True)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=False)
+    else:
+        infer_result = InferenceResult(model_json, ds)
+
+    if len(model_json) == 0 and not model_config.__use_inside_model__:
+        logger.error('Need model list input')
+        return JSONResponse(
+            content={'error': 'Model list input required'}, status_code=400
+        )
+
+    if parse_method == 'ocr':
+        pipe_res = infer_result.pipe_ocr_mode(image_writer)
+    else:
+        pipe_res = infer_result.pipe_txt_mode(image_writer)

     # Save results in text and md format
-    content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none')
-    md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none')
+    content_list = pipe_res.get_content_list(image_path_parent, drop_mode='none')
+    md_content = pipe_res.get_markdown(image_path_parent, drop_mode='none')

     if is_json_md_dump:
-        json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
+        json_md_dump(infer_result._infer_res, pipe_res._pipe_res,
+                     md_writer, pdf_name, content_list, md_content)

     data = {
-        'layout': copy.deepcopy(pipe.model_list),
-        'info': pipe.pdf_mid_data,
+        'layout': copy.deepcopy(infer_result._infer_res),
+        'info': pipe_res._pipe_res,
         'content_list': content_list,
         'md_content': md_content,
     }
```
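One branch worth calling out: when the client supplies a precomputed model_json, the rewritten endpoint wraps it in InferenceResult rather than re-running doc_analyze. A hedged sketch of that reuse path (file names are illustrative):

```python
import json

from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.operators import InferenceResult

pdf_bytes = open('abc.pdf', 'rb').read()
# A model result saved earlier, e.g. the *_model.json dumped by json_md_dump.
model_json = json.loads(open('abc_model.json', encoding='utf-8').read())

ds = PymuDocDataset(pdf_bytes)
# Skip ds.apply(doc_analyze, ...) entirely; reuse the stored inference.
infer_result = InferenceResult(model_json, ds)
pipe_res = infer_result.pipe_txt_mode(FileBasedDataWriter('images'))
print(pipe_res.get_markdown('images')[:200])
```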
projects/web_demo/web_demo/api/analysis/pdf_ext.py

```diff
@@ -11,9 +11,12 @@ from flask import current_app, url_for
 from loguru import logger

 import magic_pdf.model as model_config
+from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.libs.json_compressor import JsonCompressor
-from magic_pdf.pipe.UNIPipe import UNIPipe
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.model.operators import InferenceResult

 from ..extentions import app, db
 from .ext import find_file
@@ -25,25 +28,28 @@ model_config.__use_inside_model__ = True
 def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
     try:
         model_json = []  # pass model_json as an empty list to parse with the built-in model
+        image_writer = FileBasedDataWriter(image_dir)
         logger.info(f'is_ocr: {is_ocr}')
+        parse_method = 'ocr'
+        ds = PymuDocDataset(pdf_bytes)
         if not is_ocr:
-            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
-            image_writer = FileBasedDataWriter(image_dir)
-            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
-            pipe.pipe_classify()
-        else:
-            jso_useful_key = {'_pdf_type': 'ocr', 'model_list': model_json}
-            image_writer = FileBasedDataWriter(image_dir)
-            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
+            if ds.classify() == SupportedPdfParseMethod.OCR:
+                parse_method = 'ocr'
+            else:
+                parse_method = 'txt'

-        """If no valid model data is passed in, parse with the built-in model"""
-        if len(model_json) == 0:
-            if model_config.__use_inside_model__:
-                pipe.pipe_analyze()
-            else:
-                logger.error('need model list input')
-                exit(1)
+        if parse_method == 'ocr':
+            infer_result = ds.apply(doc_analyze, ocr=True)
+        else:
+            infer_result = ds.apply(doc_analyze, ocr=False)

-        pipe.pipe_parse()
-        pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
+        if parse_method == 'ocr':
+            pipe_res = infer_result.pipe_ocr_mode(image_writer)
+        else:
+            pipe_res = infer_result.pipe_txt_mode(image_writer)
+        pdf_mid_data = pipe_res._pipe_res
         pdf_info_list = pdf_mid_data['pdf_info']
         md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
                                 ensure_ascii=False)
@@ -52,7 +58,6 @@ def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
     except Exception as e:  # noqa: F841
         logger.error(traceback.format_exc())

-
 def get_bbox_info(data):
     bbox_info = []
     for page in data:
```