wangsen / MinerU · Commits

Commit a881ee89 (Unverified)
Authored Apr 10, 2025 by Xiaomeng Zhao; committed via GitHub on Apr 10, 2025

Merge pull request #2177 from icecraft/feat/iterator_inference

feat: inference with iter style

Parents: ce212da1, 43164533

Showing 3 changed files with 30 additions and 70 deletions (+30 −70):

- magic_pdf/data/batch_build_dataset.py (+4 −41)
- magic_pdf/data/dataset.py (+1 −10)
- magic_pdf/model/doc_analyze_by_custom_model.py (+25 −19)
magic_pdf/data/batch_build_dataset.py

```diff
@@ -107,50 +107,13 @@ def batch_build_dataset(pdf_paths, k, lang=None):
     pdf_info = []
     total_pages = 0
+    results = []
     for pdf_path in pdf_paths:
         try:
-            doc = fitz.open(pdf_path)
-            num_pages = len(doc)
-            pdf_info.append((pdf_path, num_pages))
-            total_pages += num_pages
-            doc.close()
+            with open(pdf_path, 'rb') as f:
+                bits = f.read()
+            results.append(PymuDocDataset(bits, lang))
         except Exception as e:
             print(f'Error opening {pdf_path}: {e}')
-
-    # Partition the jobs based on page count
-    # Each job has 1 page
-    partitions = partition_array_greedy(pdf_info, k)
-
-    # Process each partition in parallel
-    all_images_h = {}
-    with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
-        # Submit one task per partition
-        futures = []
-        for sn, partition in enumerate(partitions):
-            # Get the jobs for this partition
-            partition_jobs = [pdf_info[idx] for idx in partition]
-            # Submit the task
-            future = executor.submit(process_pdf_batch, partition_jobs, sn)
-            futures.append(future)
-        # Process results as they complete
-        for i, future in enumerate(concurrent.futures.as_completed(futures)):
-            try:
-                idx, images = future.result()
-                all_images_h[idx] = images
-            except Exception as e:
-                print(f'Error processing partition: {e}')
-
-    results = [None] * len(pdf_paths)
-    for i in range(len(partitions)):
-        partition = partitions[i]
-        for j in range(len(partition)):
-            with open(pdf_info[partition[j]][0], 'rb') as f:
-                pdf_bytes = f.read()
-            dataset = PymuDocDataset(pdf_bytes, lang=lang)
-            dataset.set_images(all_images_h[i][j])
-            results[partition[j]] = dataset
     return results
```
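The parallel pre-rendering is gone: batch_build_dataset now just reads each file's bytes and defers page rendering to inference time. A minimal usage sketch under that assumption — the paths and the batch_doc_analyze call are illustrative, not part of this diff:

```python
# Illustrative only: assumes the magic_pdf 1.x module layout shown in this commit.
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.model.doc_analyze_by_custom_model import batch_doc_analyze

pdf_paths = ['docs/a.pdf', 'docs/b.pdf']        # hypothetical inputs
datasets = batch_build_dataset(pdf_paths, k=4)  # k stays in the signature but the
                                                # body no longer spawns a worker pool
results = batch_doc_analyze(datasets, parse_method='auto')  # pages render lazily here
```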
magic_pdf/data/dataset.py

```diff
@@ -342,17 +342,8 @@ class Doc(PageableData):
             height: int
         }
         """
-        if self._img is None:
-            self._img = fitz_doc_to_image(self._doc)
-        return self._img
-
-    def set_image(self, img):
-        """
-        Args:
-            img (np.ndarray): the image
-        """
-        if self._img is None:
-            self._img = img
+        return fitz_doc_to_image(self._doc)
 
     def get_doc(self) -> fitz.Page:
         """Get the pymudoc object.
```
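The one-line replacement drops the `_img` cache (and the `set_image` setter that filled it), so `get_image()` re-renders the page on every call. A self-contained sketch of the trade-off — not MinerU code — recomputing costs CPU per call but stops pinning one bitmap per page in memory:

```python
import numpy as np

class CachingPage:
    """Old shape: the first get_image() pins the bitmap for the object's lifetime."""
    def __init__(self, render):
        self._render = render
        self._img = None
    def get_image(self):
        if self._img is None:
            self._img = self._render()  # cached forever after the first call
        return self._img

class LazyPage:
    """New shape: every get_image() re-renders; nothing is retained between calls."""
    def __init__(self, render):
        self._render = render
    def get_image(self):
        return self._render()

# An A4 page at ~200 dpi is roughly a 2339x1654 RGB array, about 11 MB --
# holding one of these per page of a large batch is what the commit avoids.
render = lambda: np.zeros((2339, 1654, 3), dtype=np.uint8)
page = LazyPage(render)
assert page.get_image() is not page.get_image()  # a fresh array per call
```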
magic_pdf/model/doc_analyze_by_custom_model.py

```diff
@@ -138,30 +138,31 @@ def doc_analyze(
 )
     MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
+    batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    images_with_extra_info = []
+    results = []
     for index in range(len(dataset)):
         if start_page_id <= index <= end_page_id:
             page_data = dataset.get_page(index)
             img_dict = page_data.get_image()
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
 
-    if lang is None or lang == 'auto':
-        images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
-    else:
-        images_with_extra_info = [(images[index], ocr, lang) for index in range(len(images))]
-
-    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        batch_size = MIN_BATCH_INFERENCE_SIZE
-        batch_images = [images_with_extra_info[i:i + batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
-    else:
-        batch_images = [images_with_extra_info]
-
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, layout_model, formula_enable, table_enable)
-        results.extend(result)
+            if lang is None or lang == 'auto':
+                images_with_extra_info.append((images[index], ocr, dataset._lang))
+            else:
+                images_with_extra_info.append((images[index], ocr, lang))
+
+            if len(images_with_extra_info) == batch_size:
+                _, result = may_batch_image_analyze(images_with_extra_info, 0, ocr, show_log, layout_model, formula_enable, table_enable)
+                results.extend(result)
+                images_with_extra_info = []
+
+    if len(images_with_extra_info) > 0:
+        _, result = may_batch_image_analyze(images_with_extra_info, 0, ocr, show_log, layout_model, formula_enable, table_enable)
+        results.extend(result)
+        images_with_extra_info = []
 
     model_json = []
     for index in range(len(dataset)):
```
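Stripped of the MinerU specifics, the new doc_analyze follows a plain accumulate-and-flush pattern: collect pages until a batch fills, run inference, drop the batch, and flush whatever remains at the end. A self-contained sketch of that pattern, with `analyze` standing in for may_batch_image_analyze:

```python
from typing import Callable, Iterable, List, Sequence, Tuple

def batched_infer(
    pages: Iterable[Tuple],                      # (image, ocr_flag, lang) triples
    batch_size: int,
    analyze: Callable[[Sequence[Tuple]], List],  # stand-in for may_batch_image_analyze
) -> List:
    results: List = []
    pending: List[Tuple] = []
    for page in pages:
        pending.append(page)
        if len(pending) == batch_size:    # flush as soon as a full batch accumulates
            results.extend(analyze(pending))
            pending = []                  # rendered images become garbage-collectable
    if pending:                           # trailing partial batch
        results.extend(analyze(pending))
    return results

# Usage: five pages, batch size two -> batches of 2, 2, then a final 1.
out = batched_infer(((i, True, 'en') for i in range(5)), 2, lambda b: [len(b)] * len(b))
assert out == [2, 2, 2, 2, 1]
```

The point of the restructuring is that peak memory is now bounded by batch_size rendered pages rather than by the whole document.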
```diff
@@ -193,6 +194,7 @@ def batch_doc_analyze(
     batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    results = []
     images_with_extra_info = []
     for dataset in datasets:
```

```diff
@@ -211,11 +213,15 @@ def batch_doc_analyze(
         else:
             images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
 
-    batch_images = [images_with_extra_info[i:i + batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
-        results.extend(result)
+        if len(images_with_extra_info) == batch_size:
+            _, result = may_batch_image_analyze(images_with_extra_info, 0, True, show_log, layout_model, formula_enable, table_enable)
+            results.extend(result)
+            images_with_extra_info = []
+
+    if len(images_with_extra_info) > 0:
+        _, result = may_batch_image_analyze(images_with_extra_info, 0, True, show_log, layout_model, formula_enable, table_enable)
+        results.extend(result)
+        images_with_extra_info = []
 
     infer_results = []
     from magic_pdf.operators.models import InferenceResult
```
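Both functions size their flushes from the MINERU_MIN_BATCH_INFERENCE_SIZE environment variable, defaulting to 200 pages per the line in the diff above. To trade throughput for a lower peak footprint, the knob can be set before analysis runs; the value below is only an example:

```python
import os

# Must be set before doc_analyze / batch_doc_analyze read it at call time.
os.environ['MINERU_MIN_BATCH_INFERENCE_SIZE'] = '64'  # example value, not a recommendation
batch_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
assert batch_size == 64
```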