Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
91aa7e56
Unverified
Commit
91aa7e56
authored
Dec 12, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 12, 2024
Browse files
Merge pull request #1277 from opendatalab/master
remove old demo code
parents
5bbd07a1
c65c0478
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
0 additions
and
179 deletions
+0
-179
README.md
README.md
+0
-4
README_zh-CN.md
README_zh-CN.md
+0
-4
demo/demo.py
demo/demo.py
+0
-25
demo/magic_pdf_parse_main.py
demo/magic_pdf_parse_main.py
+0
-146
No files found.
README.md
View file @
91aa7e56
...
...
@@ -286,10 +286,6 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
[
Using MinerU via Python API
](
https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html
)
For detailed implementation, refer to:
-
[
demo.py Simplest Processing Method
](
demo/demo.py
)
-
[
magic_pdf_parse_main.py More Detailed Processing Workflow
](
demo/magic_pdf_parse_main.py
)
### Deploy Derived Projects
...
...
README_zh-CN.md
View file @
91aa7e56
...
...
@@ -293,10 +293,6 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
[
通过Python代码调用MinerU
](
https://mineru.readthedocs.io/zh-cn/latest/user_guide/quick_start/to_markdown.html
)
详细实现可参考
-
[
demo.py 最简单的处理方式
](
demo/demo.py
)
-
[
magic_pdf_parse_main.py 能够更清晰看到处理流程
](
demo/magic_pdf_parse_main.py
)
### 部署衍生项目
...
...
demo/demo.py
deleted
100644 → 0
View file @
5bbd07a1
import
os
from
loguru
import
logger
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
try
:
current_script_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
demo_name
=
'demo1'
pdf_path
=
os
.
path
.
join
(
current_script_dir
,
f
'
{
demo_name
}
.pdf'
)
pdf_bytes
=
open
(
pdf_path
,
'rb'
).
read
()
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
[]}
local_image_dir
=
os
.
path
.
join
(
current_script_dir
,
'images'
)
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
image_writer
=
FileBasedDataWriter
(
local_image_dir
)
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
)
pipe
.
pipe_classify
()
pipe
.
pipe_analyze
()
pipe
.
pipe_parse
()
md_content
=
pipe
.
pipe_mk_markdown
(
image_dir
,
drop_mode
=
'none'
)
with
open
(
f
'
{
demo_name
}
.md'
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
md_content
)
except
Exception
as
e
:
logger
.
exception
(
e
)
demo/magic_pdf_parse_main.py
deleted
100644 → 0
View file @
5bbd07a1
import
copy
import
json
import
os
from
loguru
import
logger
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.libs.draw_bbox
import
draw_layout_bbox
,
draw_span_bbox
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
# todo: 设备类型选择 (?)
def
json_md_dump
(
pipe
,
md_writer
,
pdf_name
,
content_list
,
md_content
,
orig_model_list
,
):
# 写入模型结果到 model.json
md_writer
.
write_string
(
f
'
{
pdf_name
}
_model.json'
,
json
.
dumps
(
orig_model_list
,
ensure_ascii
=
False
,
indent
=
4
)
)
# 写入中间结果到 middle.json
md_writer
.
write_string
(
f
'
{
pdf_name
}
_middle.json'
,
json
.
dumps
(
pipe
.
pdf_mid_data
,
ensure_ascii
=
False
,
indent
=
4
)
)
# text文本结果写入到 conent_list.json
md_writer
.
write_string
(
f
'
{
pdf_name
}
_content_list.json'
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
)
)
# 写入结果到 .md 文件中
md_writer
.
write_string
(
f
'
{
pdf_name
}
.md'
,
md_content
,
)
# 可视化
def
draw_visualization_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
pdf_file_name
):
# 画布局框,附带排序结果
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
pdf_file_name
)
# 画 span 框
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
pdf_file_name
)
def
pdf_parse_main
(
pdf_path
:
str
,
parse_method
:
str
=
'auto'
,
model_json_path
:
str
=
None
,
is_json_md_dump
:
bool
=
True
,
is_draw_visualization_bbox
:
bool
=
True
,
output_dir
:
str
=
None
):
"""执行从 pdf 转换到 json、md 的过程,输出 md 和 json 文件到 pdf 文件所在的目录.
:param pdf_path: .pdf 文件的路径,可以是相对路径,也可以是绝对路径
:param parse_method: 解析方法, 共 auto、ocr、txt 三种,默认 auto,如果效果不好,可以尝试 ocr
:param model_json_path: 已经存在的模型数据文件,如果为空则使用内置模型,pdf 和 model_json 务必对应
:param is_json_md_dump: 是否将解析后的数据写入到 .json 和 .md 文件中,默认 True,会将不同阶段的数据写入到不同的 .json 文件中(共3个.json文件),md内容会保存到 .md 文件中
:param is_draw_visualization_bbox: 是否绘制可视化边界框,默认 True,会生成布局框和 span 框的图像
:param output_dir: 输出结果的目录地址,会生成一个以 pdf 文件名命名的文件夹并保存所有结果
"""
try
:
pdf_name
=
os
.
path
.
basename
(
pdf_path
).
split
(
'.'
)[
0
]
pdf_path_parent
=
os
.
path
.
dirname
(
pdf_path
)
if
output_dir
:
output_path
=
os
.
path
.
join
(
output_dir
,
pdf_name
)
else
:
output_path
=
os
.
path
.
join
(
pdf_path_parent
,
pdf_name
)
output_image_path
=
os
.
path
.
join
(
output_path
,
'images'
)
# 获取图片的父路径,为的是以相对路径保存到 .md 和 conent_list.json 文件中
image_path_parent
=
os
.
path
.
basename
(
output_image_path
)
pdf_bytes
=
open
(
pdf_path
,
'rb'
).
read
()
# 读取 pdf 文件的二进制数据
orig_model_list
=
[]
if
model_json_path
:
# 读取已经被模型解析后的pdf文件的 json 原始数据,list 类型
model_json
=
json
.
loads
(
open
(
model_json_path
,
'r'
,
encoding
=
'utf-8'
).
read
())
orig_model_list
=
copy
.
deepcopy
(
model_json
)
else
:
model_json
=
[]
# 执行解析步骤
image_writer
,
md_writer
=
FileBasedDataWriter
(
output_image_path
),
FileBasedDataWriter
(
output_path
)
# 选择解析方式
if
parse_method
==
'auto'
:
jso_useful_key
=
{
'_pdf_type'
:
''
,
'model_list'
:
model_json
}
pipe
=
UNIPipe
(
pdf_bytes
,
jso_useful_key
,
image_writer
)
elif
parse_method
==
'txt'
:
pipe
=
TXTPipe
(
pdf_bytes
,
model_json
,
image_writer
)
elif
parse_method
==
'ocr'
:
pipe
=
OCRPipe
(
pdf_bytes
,
model_json
,
image_writer
)
else
:
logger
.
error
(
'unknown parse method, only auto, ocr, txt allowed'
)
exit
(
1
)
# 执行分类
pipe
.
pipe_classify
()
# 如果没有传入模型数据,则使用内置模型解析
if
len
(
model_json
)
==
0
:
pipe
.
pipe_analyze
()
# 解析
orig_model_list
=
copy
.
deepcopy
(
pipe
.
model_list
)
# 执行解析
pipe
.
pipe_parse
()
# 保存 text 和 md 格式的结果
content_list
=
pipe
.
pipe_mk_uni_format
(
image_path_parent
,
drop_mode
=
'none'
)
md_content
=
pipe
.
pipe_mk_markdown
(
image_path_parent
,
drop_mode
=
'none'
)
if
is_json_md_dump
:
json_md_dump
(
pipe
,
md_writer
,
pdf_name
,
content_list
,
md_content
,
orig_model_list
)
if
is_draw_visualization_bbox
:
draw_visualization_bbox
(
pipe
.
pdf_mid_data
[
'pdf_info'
],
pdf_bytes
,
output_path
,
pdf_name
)
except
Exception
as
e
:
logger
.
exception
(
e
)
# 测试
if
__name__
==
'__main__'
:
current_script_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
demo_names
=
[
'demo1'
,
'demo2'
,
'small_ocr'
]
for
name
in
demo_names
:
file_path
=
os
.
path
.
join
(
current_script_dir
,
f
'
{
name
}
.pdf'
)
pdf_parse_main
(
file_path
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment