Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
7285ea92
Commit
7285ea92
authored
May 29, 2025
by
myhloli
Browse files
refactor: improve document analysis by integrating image loading and enhancing data handling
parent
ea5cb65a
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
11 deletions
+14
-11
mineru/backend/pipeline/pipeline_analyze.py
mineru/backend/pipeline/pipeline_analyze.py
+14
-11
No files found.
mineru/backend/pipeline/pipeline_analyze.py
View file @
7285ea92
...
@@ -6,11 +6,9 @@ from pypdfium2 import PdfDocument
...
@@ -6,11 +6,9 @@ from pypdfium2 import PdfDocument
from
mineru.backend.pipeline.model_init
import
MineruPipelineModel
from
mineru.backend.pipeline.model_init
import
MineruPipelineModel
from
.model_json_to_middle_json
import
result_to_middle_json
from
.model_json_to_middle_json
import
result_to_middle_json
from
...data.data_reader_writer
import
DataWriter
from
...utils.pdf_classify
import
classify
from
...utils.pdf_classify
import
classify
from
...utils.pdf_image_tools
import
pdf_page_to_image
from
...utils.pdf_image_tools
import
load_images_from_pdf
from
loguru
import
logger
from
loguru
import
logger
...
@@ -87,6 +85,7 @@ def custom_model_init(
...
@@ -87,6 +85,7 @@ def custom_model_init(
def
doc_analyze
(
def
doc_analyze
(
pdf_bytes_list
,
pdf_bytes_list
,
lang_list
,
lang_list
,
image_writer
:
DataWriter
|
None
,
parse_method
:
str
=
'auto'
,
parse_method
:
str
=
'auto'
,
formula_enable
=
None
,
formula_enable
=
None
,
table_enable
=
None
,
table_enable
=
None
,
...
@@ -108,6 +107,8 @@ def doc_analyze(
...
@@ -108,6 +107,8 @@ def doc_analyze(
# 收集所有页面信息
# 收集所有页面信息
all_pages_info
=
[]
# 存储(dataset_index, page_index, img, ocr, lang, width, height)
all_pages_info
=
[]
# 存储(dataset_index, page_index, img, ocr, lang, width, height)
all_image_lists
=
[]
all_pdf_docs
=
[]
for
pdf_idx
,
pdf_bytes
in
enumerate
(
pdf_bytes_list
):
for
pdf_idx
,
pdf_bytes
in
enumerate
(
pdf_bytes_list
):
# 确定OCR设置
# 确定OCR设置
_ocr
=
False
_ocr
=
False
...
@@ -120,14 +121,14 @@ def doc_analyze(
...
@@ -120,14 +121,14 @@ def doc_analyze(
_lang
=
lang_list
[
pdf_idx
]
_lang
=
lang_list
[
pdf_idx
]
# 收集每个数据集中的页面
# 收集每个数据集中的页面
pdf_doc
=
PdfDocument
(
pdf_bytes
)
images_list
,
pdf_doc
=
load_images_from_pdf
(
pdf_bytes
)
for
page_idx
in
range
(
len
(
pdf_doc
)):
all_image_lists
.
append
(
images_list
)
page_data
=
pdf_doc
[
page_idx
]
all_pdf_docs
.
append
(
pdf_doc
)
img_dict
=
pdf_page_to_image
(
page_data
)
for
page_idx
in
range
(
len
(
images_list
)):
img_dict
=
images_list
[
page_idx
]
all_pages_info
.
append
((
all_pages_info
.
append
((
pdf_idx
,
page_idx
,
pdf_idx
,
page_idx
,
img_dict
[
'img_pil'
],
_ocr
,
_lang
,
img_dict
[
'img_pil'
],
_ocr
,
_lang
,
img_dict
[
'scale'
]
))
))
# 准备批处理
# 准备批处理
...
@@ -164,8 +165,10 @@ def doc_analyze(
...
@@ -164,8 +165,10 @@ def doc_analyze(
infer_results
[
pdf_idx
].
append
(
page_dict
)
infer_results
[
pdf_idx
].
append
(
page_dict
)
middle_json_list
=
[]
middle_json_list
=
[]
for
model_json
in
infer_results
:
for
pdf_idx
,
model_json
in
enumerate
(
infer_results
):
middle_json
=
result_to_middle_json
(
model_json
)
images_list
=
all_image_lists
[
pdf_idx
]
pdf_doc
=
all_pdf_docs
[
pdf_idx
]
middle_json
=
result_to_middle_json
(
model_json
,
images_list
,
pdf_doc
,
image_writer
)
middle_json_list
.
append
(
middle_json
)
middle_json_list
.
append
(
middle_json
)
return
middle_json_list
,
infer_results
return
middle_json_list
,
infer_results
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment