Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
38ace5dc
Commit
38ace5dc
authored
Jun 03, 2025
by
myhloli
Browse files
refactor: streamline document analysis and enhance image handling in processing pipeline
parent
68338825
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
63 additions
and
78 deletions
+63
-78
mineru/backend/pipeline/pipeline_analyze.py
mineru/backend/pipeline/pipeline_analyze.py
+1
-23
mineru/cli/common.py
mineru/cli/common.py
+59
-52
mineru/utils/cut_image.py
mineru/utils/cut_image.py
+1
-1
mineru/utils/pdf_image_tools.py
mineru/utils/pdf_image_tools.py
+2
-2
No files found.
mineru/backend/pipeline/pipeline_analyze.py
View file @
38ace5dc
...
...
@@ -80,23 +80,10 @@ def custom_model_init(
def
doc_analyze
(
pdf_bytes_list
,
lang_list
,
image_writer
:
DataWriter
|
None
,
parse_method
:
str
=
'auto'
,
formula_enable
=
None
,
table_enable
=
None
,
):
"""
统一处理文档分析函数,根据输入参数类型决定处理单个数据集还是多个数据集
Args:
dataset_or_datasets: 单个Dataset对象或Dataset对象列表
parse_method: 解析方法,'auto'/'ocr'/'txt'
formula_enable: 是否启用公式识别
table_enable: 是否启用表格识别
Returns:
单个dataset时返回单个model_json,多个dataset时返回model_json列表
"""
MIN_BATCH_INFERENCE_SIZE
=
int
(
os
.
environ
.
get
(
'MINERU_MIN_BATCH_INFERENCE_SIZE'
,
100
))
# 收集所有页面信息
...
...
@@ -159,16 +146,7 @@ def doc_analyze(
page_dict
=
{
'layout_dets'
:
result
,
'page_info'
:
page_info_dict
}
infer_results
[
pdf_idx
][
page_idx
]
=
page_dict
middle_json_list
=
[]
for
pdf_idx
,
model_list
in
enumerate
(
infer_results
):
images_list
=
all_image_lists
[
pdf_idx
]
pdf_doc
=
all_pdf_docs
[
pdf_idx
]
_lang
=
lang_list
[
pdf_idx
]
_ocr
=
ocr_enabled_list
[
pdf_idx
]
middle_json
=
result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr
)
middle_json_list
.
append
(
middle_json
)
return
middle_json_list
,
infer_results
return
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
def
batch_image_analyze
(
...
...
mineru/cli/common.py
View file @
38ace5dc
...
...
@@ -6,6 +6,8 @@ from pathlib import Path
import
pypdfium2
as
pdfium
from
loguru
import
logger
from
mineru.backend.pipeline.model_json_to_middle_json
import
result_to_middle_json
as
pipeline_result_to_middle_json
from
..api.vlm_middle_json_mkcontent
import
union_make
from
..backend.vlm.vlm_analyze
import
doc_analyze
as
vlm_doc_analyze
from
..backend.pipeline.pipeline_analyze
import
doc_analyze
as
pipeline_doc_analyze
...
...
@@ -30,10 +32,8 @@ def read_fn(path: Path):
def
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
):
local_parent_dir
=
os
.
path
.
join
(
output_dir
,
pdf_file_name
,
parse_method
)
local_image_dir
=
os
.
path
.
join
(
str
(
local_parent_dir
),
"images"
)
local_md_dir
=
local_parent_dir
local_md_dir
=
str
(
os
.
path
.
join
(
output_dir
,
pdf_file_name
,
parse_method
))
local_image_dir
=
os
.
path
.
join
(
str
(
local_md_dir
),
"images"
)
os
.
makedirs
(
local_image_dir
,
exist_ok
=
True
)
os
.
makedirs
(
local_md_dir
,
exist_ok
=
True
)
return
local_image_dir
,
local_md_dir
...
...
@@ -95,15 +95,23 @@ def do_parse(
if
backend
==
"pipeline"
:
for
pdf_bytes
in
pdf_bytes_list
:
pdf_bytes
=
convert_pdf_bytes_to_bytes_by_pypdfium2
(
pdf_bytes
,
start_page_id
,
end_page_id
)
middle_json_list
,
infer_results
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
p_formula_enable
,
table_enable
=
p_table_enable
)
for
idx
,
middle_json
in
enumerate
(
middle_json_list
):
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
p_formula_enable
,
table_enable
=
p_table_enable
)
for
idx
,
model_list
in
enumerate
(
infer_results
):
pdf_file_name
=
pdf_file_names
[
idx
]
model_json
=
infer_results
[
idx
]
local_image_dir
,
local_md_dir
=
prepare_env
(
output_dir
,
pdf_file_name
,
parse_method
)
image_writer
,
md_writer
=
FileBasedDataWriter
(
local_image_dir
),
FileBasedDataWriter
(
local_md_dir
)
images_list
=
all_image_lists
[
idx
]
pdf_doc
=
all_pdf_docs
[
idx
]
_lang
=
lang_list
[
idx
]
_ocr
=
ocr_enabled_list
[
idx
]
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr
)
pdf_info
=
middle_json
[
"pdf_info"
]
pdf_bytes
=
pdf_bytes_list
[
idx
]
if
f_draw_layout_bbox
:
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
f
"
{
pdf_file_name
}
_layout.pdf"
)
...
...
@@ -155,52 +163,51 @@ def do_parse(
image_writer
,
md_writer
=
FileBasedDataWriter
(
local_image_dir
),
FileBasedDataWriter
(
local_md_dir
)
middle_json
,
infer_result
=
vlm_doc_analyze
(
pdf_bytes
,
image_writer
=
image_writer
,
backend
=
backend
,
model_path
=
model_path
,
server_url
=
server_url
)
pdf_info
=
middle_json
[
"pdf_info"
]
if
f_draw_layout_bbox
:
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
f
"
{
pdf_file_name
}
_layout.pdf"
)
if
f_draw_span_bbox
:
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
f
"
{
pdf_file_name
}
_span.pdf"
)
if
f_dump_orig_pdf
:
md_writer
.
write
(
f
"
{
pdf_file_name
}
_origin.pdf"
,
pdf_bytes
,
)
if
f_dump_md
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
md_content_str
=
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
.md"
,
md_content_str
,
)
if
f_dump_content_list
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
content_list
=
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_content_list.json"
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
)
if
f_dump_middle_json
:
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_middle.json"
,
json
.
dumps
(
middle_json
,
ensure_ascii
=
False
,
indent
=
4
),
)
if
f_dump_model_output
:
model_output
=
(
"
\n
"
+
"-"
*
50
+
"
\n
"
).
join
(
infer_result
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_model_output.txt"
,
model_output
,
)
logger
.
info
(
f
"local output dir is
{
local_md_dir
}
"
)
return
infer_result
pdf_info
=
middle_json
[
"pdf_info"
]
if
f_draw_layout_bbox
:
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
f
"
{
pdf_file_name
}
_layout.pdf"
)
if
f_draw_span_bbox
:
draw_span_bbox
(
pdf_info
,
pdf_bytes
,
local_md_dir
,
f
"
{
pdf_file_name
}
_span.pdf"
)
if
f_dump_orig_pdf
:
md_writer
.
write
(
f
"
{
pdf_file_name
}
_origin.pdf"
,
pdf_bytes
,
)
if
f_dump_md
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
md_content_str
=
union_make
(
pdf_info
,
f_make_md_mode
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
.md"
,
md_content_str
,
)
if
f_dump_content_list
:
image_dir
=
str
(
os
.
path
.
basename
(
local_image_dir
))
content_list
=
union_make
(
pdf_info
,
MakeMode
.
STANDARD_FORMAT
,
image_dir
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_content_list.json"
,
json
.
dumps
(
content_list
,
ensure_ascii
=
False
,
indent
=
4
),
)
if
f_dump_middle_json
:
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_middle.json"
,
json
.
dumps
(
middle_json
,
ensure_ascii
=
False
,
indent
=
4
),
)
if
f_dump_model_output
:
model_output
=
(
"
\n
"
+
"-"
*
50
+
"
\n
"
).
join
(
infer_result
)
md_writer
.
write_string
(
f
"
{
pdf_file_name
}
_model_output.txt"
,
model_output
,
)
logger
.
info
(
f
"local output dir is
{
local_md_dir
}
"
)
if
__name__
==
"__main__"
:
...
...
mineru/utils/cut_image.py
View file @
38ace5dc
...
...
@@ -14,7 +14,7 @@ def cut_image_and_table(span, page_pil_img, page_img_md5, page_id, image_writer,
span
[
"image_path"
]
=
""
else
:
span
[
"image_path"
]
=
cut_image
(
span
[
"bbox"
],
page_id
,
page_pil_img
,
return_path
=
return_path
(
span_type
),
image
W
riter
=
image
W
riter
,
scale
=
scale
span
[
"bbox"
],
page_id
,
page_pil_img
,
return_path
=
return_path
(
span_type
),
image
_w
riter
=
image
_w
riter
,
scale
=
scale
)
return
span
...
...
mineru/utils/pdf_image_tools.py
View file @
38ace5dc
...
...
@@ -54,7 +54,7 @@ def load_images_from_pdf(
return
images_list
,
pdf_doc
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page_pil_img
,
return_path
,
image
W
riter
:
FileBasedDataWriter
,
scale
=
2
):
def
cut_image
(
bbox
:
tuple
,
page_num
:
int
,
page_pil_img
,
return_path
,
image
_w
riter
:
FileBasedDataWriter
,
scale
=
2
):
"""从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
图片存放在save_path下,文件名是:
{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
...
...
@@ -73,7 +73,7 @@ def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, imageWriter
img_bytes
=
image_to_bytes
(
crop_img
,
image_format
=
"JPEG"
)
image
W
riter
.
write
(
img_hash256_path
,
img_bytes
)
image
_w
riter
.
write
(
img_hash256_path
,
img_bytes
)
return
img_hash256_path
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment