# wangsen/MinerU, commit 43164533

feat: inference with iter style

Authored Apr 10, 2025 by icecraft. Parent: ce212da1.
Showing 3 changed files, with 30 additions and 70 deletions:

- magic_pdf/data/batch_build_dataset.py (+4, -41)
- magic_pdf/data/dataset.py (+1, -10)
- magic_pdf/model/doc_analyze_by_custom_model.py (+25, -19)
## magic_pdf/data/batch_build_dataset.py

```diff
@@ -107,50 +107,13 @@ def batch_build_dataset(pdf_paths, k, lang=None):
     pdf_info = []
     total_pages = 0
+    results = []
     for pdf_path in pdf_paths:
         try:
-            doc = fitz.open(pdf_path)
-            num_pages = len(doc)
-            pdf_info.append((pdf_path, num_pages))
-            total_pages += num_pages
-            doc.close()
+            with open(pdf_path, 'rb') as f:
+                bits = f.read()
+            results.append(PymuDocDataset(bits, lang))
         except Exception as e:
             print(f'Error opening {pdf_path}: {e}')
-
-    # Partition the jobs based on page count
-    # Each job has 1 page
-    partitions = partition_array_greedy(pdf_info, k)
-
-    # Process each partition in parallel
-    all_images_h = {}
-    with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
-        # Submit one task per partition
-        futures = []
-        for sn, partition in enumerate(partitions):
-            # Get the jobs for this partition
-            partition_jobs = [pdf_info[idx] for idx in partition]
-            # Submit the task
-            future = executor.submit(process_pdf_batch, partition_jobs, sn)
-            futures.append(future)
-        # Process results as they complete
-        for i, future in enumerate(concurrent.futures.as_completed(futures)):
-            try:
-                idx, images = future.result()
-                all_images_h[idx] = images
-            except Exception as e:
-                print(f'Error processing partition: {e}')
-
-    results = [None] * len(pdf_paths)
-    for i in range(len(partitions)):
-        partition = partitions[i]
-        for j in range(len(partition)):
-            with open(pdf_info[partition[j]][0], 'rb') as f:
-                pdf_bytes = f.read()
-            dataset = PymuDocDataset(pdf_bytes, lang=lang)
-            dataset.set_images(all_images_h[i][j])
-            results[partition[j]] = dataset
     return results
```
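After this hunk, batch_build_dataset no longer pre-renders page images through a ProcessPoolExecutor; it simply wraps each PDF's bytes in a PymuDocDataset, and pages are rasterized lazily when get_image() is called. A minimal usage sketch; the paths are hypothetical, and note that within this hunk the k parameter is no longer used:

```python
from magic_pdf.data.batch_build_dataset import batch_build_dataset

# Hypothetical input paths; any readable PDFs will do.
pdf_paths = ['docs/a.pdf', 'docs/b.pdf']

# k remains in the signature, but as of this hunk it no longer drives a worker pool.
datasets = batch_build_dataset(pdf_paths, k=4, lang=None)

# Page images are now produced on demand rather than up front.
img_dict = datasets[0].get_page(0).get_image()
print(img_dict['width'], img_dict['height'])
```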
## magic_pdf/data/dataset.py

```diff
@@ -342,17 +342,8 @@ class Doc(PageableData):
             height: int
         }
         """
-        if self._img is None:
-            self._img = fitz_doc_to_image(self._doc)
-        return self._img
-
-    def set_image(self, img):
-        """
-        Args:
-            img (np.ndarray): the image
-        """
-        if self._img is None:
-            self._img = img
+        return fitz_doc_to_image(self._doc)
 
     def get_doc(self) -> fitz.Page:
         """Get the pymudoc object.
```
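With the _img cache and the set_image hook removed, Doc.get_image() re-renders the page via fitz_doc_to_image on every call. Callers that need the same page image more than once should keep their own reference. A small caller-side sketch, assuming a hypothetical PDF path:

```python
from magic_pdf.data.dataset import PymuDocDataset

# Hypothetical path; any readable PDF will do.
with open('docs/a.pdf', 'rb') as f:
    dataset = PymuDocDataset(f.read())

page = dataset.get_page(0)

# get_image() now calls fitz_doc_to_image on every invocation (no caching),
# so render once and reuse the returned dict.
img_dict = page.get_image()
img, w, h = img_dict['img'], img_dict['width'], img_dict['height']
```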
## magic_pdf/model/doc_analyze_by_custom_model.py

```diff
@@ -138,30 +138,31 @@ def doc_analyze(
     )
 
     MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
+    batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    images_with_extra_info = []
+    results = []
     for index in range(len(dataset)):
         if start_page_id <= index <= end_page_id:
             page_data = dataset.get_page(index)
             img_dict = page_data.get_image()
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if lang is None or lang == 'auto':
-        images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
-    else:
-        images_with_extra_info = [(images[index], ocr, lang) for index in range(len(images))]
-
-    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        batch_size = MIN_BATCH_INFERENCE_SIZE
-        batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
-    else:
-        batch_images = [images_with_extra_info]
-
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, layout_model, formula_enable, table_enable)
-        results.extend(result)
+        if lang is None or lang == 'auto':
+            images_with_extra_info.append((images[index], ocr, dataset._lang))
+        else:
+            images_with_extra_info.append((images[index], ocr, lang))
+        if len(images_with_extra_info) == batch_size:
+            _, result = may_batch_image_analyze(images_with_extra_info, 0, ocr, show_log, layout_model, formula_enable, table_enable)
+            results.extend(result)
+            images_with_extra_info = []
+
+    if len(images_with_extra_info) > 0:
+        _, result = may_batch_image_analyze(images_with_extra_info, 0, ocr, show_log, layout_model, formula_enable, table_enable)
+        results.extend(result)
+        images_with_extra_info = []
 
     model_json = []
     for index in range(len(dataset)):
```
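The hunk above replaces "collect everything, then split into batches" with a streaming pattern: accumulate (image, ocr, lang) tuples, flush them through may_batch_image_analyze as soon as the buffer reaches batch_size, then flush whatever remains after the loop. A minimal sketch of that pattern in isolation; iter_batches and process_batch are hypothetical stand-ins, not part of the codebase:

```python
def iter_batches(items, batch_size, process_batch):
    """Stream items through process_batch in fixed-size chunks."""
    results, buffer = [], []
    for item in items:
        buffer.append(item)
        if len(buffer) == batch_size:   # flush a full batch immediately
            results.extend(process_batch(buffer))
            buffer = []
    if buffer:                          # flush the trailing partial batch
        results.extend(process_batch(buffer))
    return results

# Trivial stand-in for may_batch_image_analyze:
print(iter_batches(range(5), 2, lambda batch: [x * 10 for x in batch]))
# -> [0, 10, 20, 30, 40]
```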
```diff
@@ -193,6 +194,7 @@ def batch_doc_analyze(
     batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    results = []
     images_with_extra_info = []
     for dataset in datasets:
```
```diff
@@ -211,11 +213,15 @@ def batch_doc_analyze(
         else:
             images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
-
-    batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
-        results.extend(result)
+        if len(images_with_extra_info) == batch_size:
+            _, result = may_batch_image_analyze(images_with_extra_info, 0, True, show_log, layout_model, formula_enable, table_enable)
+            results.extend(result)
+            images_with_extra_info = []
+
+    if len(images_with_extra_info) > 0:
+        _, result = may_batch_image_analyze(images_with_extra_info, 0, True, show_log, layout_model, formula_enable, table_enable)
+        results.extend(result)
+        images_with_extra_info = []
 
     infer_results = []
     from magic_pdf.operators.models import InferenceResult
```
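Design note: in both doc_analyze and batch_doc_analyze, the batch sequence number passed to may_batch_image_analyze is now a constant 0, where the old code numbered batches via enumerate. Result order is preserved anyway, because each flushed batch is extended onto results in iteration order.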