wangsen / MinerU · Commit b3faee93 (unverified)

Authored Mar 19, 2025 by Xiaomeng Zhao; committed via GitHub on Mar 19, 2025

Merge pull request #1944 from icecraft/feat/remove_intree_parallel_inference

style: remove unused code

Parents: e33ec616, e9c24739
Showing 2 changed files with 22 additions and 78 deletions (+22 −78):

magic_pdf/model/doc_analyze_by_custom_model.py (+21 −77)
magic_pdf/tools/common.py (+1 −1)
magic_pdf/model/doc_analyze_by_custom_model.py
```diff
@@ -34,8 +34,6 @@ from magic_pdf.model.model_list import MODEL
 # from magic_pdf.operators.models import InferenceResult
 MIN_BATCH_INFERENCE_SIZE = 100

 class ModelSingleton:
     _instance = None
     _models = {}
```
```diff
@@ -143,17 +141,14 @@ def doc_analyze(
     layout_model=None,
     formula_enable=None,
     table_enable=None,
-    one_shot: bool = True,
 ):
     end_page_id = (
         end_page_id
         if end_page_id is not None and end_page_id >= 0
         else len(dataset) - 1
     )
-    parallel_count = None
-    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
-        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])
+    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
     images = []
     page_wh_list = []
     for index in range(len(dataset)):
```
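The hunk above drops the `MINERU_PARALLEL_INFERENCE_COUNT` lookup and the `one_shot` flag; the only knob left is `MINERU_MIN_BATCH_INFERENCE_SIZE`. A minimal sketch of how the added line behaves (the variable name and the default of 100 are taken from the diff):

```python
import os

# The threshold is read from the environment at call time; when the variable
# is unset, os.environ.get returns the int fallback 100, and int() accepts
# both that fallback and a string value such as '50'.
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
print(MIN_BATCH_INFERENCE_SIZE)  # 100 unless MINERU_MIN_BATCH_INFERENCE_SIZE is exported
```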
```diff
@@ -163,41 +158,16 @@ def doc_analyze(
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if one_shot and len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        if parallel_count is None:
-            parallel_count = 2  # should check the gpu memory firstly !
-        # split images into parallel_count batches
-        if parallel_count > 1:
-            batch_size = (len(images) + parallel_count - 1) // parallel_count
-            batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
-        else:
-            batch_images = [images]
-        results = []
-        parallel_count = len(batch_images)  # adjust to real parallel count
-        # using concurrent.futures to analyze
-        """
-        with fut.ProcessPoolExecutor(max_workers=parallel_count) as executor:
-            futures = [executor.submit(may_batch_image_analyze, batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)]
-            for future in fut.as_completed(futures):
-                sn, result = future.result()
-                result_history[sn] = result
-        for key in sorted(result_history.keys()):
-            results.extend(result_history[key])
-        """
-        results = []
-        pool = mp.Pool(processes=parallel_count)
-        mapped_results = pool.starmap(may_batch_image_analyze, [(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)])
-        for sn, result in mapped_results:
-            results.extend(result)
+    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
+        batch_size = MIN_BATCH_INFERENCE_SIZE
+        batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
     else:
-        _, results = may_batch_image_analyze(images, 0, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        batch_images = [images]
+
+    results = []
+    for sn, batch_image in enumerate(batch_images):
+        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        results.extend(result)

     model_json = []
     for index in range(len(dataset)):
```
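This hunk is the core of the change: the removed branch split the pages into `parallel_count` batches and fanned them out to an `mp.Pool` (with a `concurrent.futures` variant left commented out), while the added branch slices the pages into fixed-size batches and feeds them to `may_batch_image_analyze` sequentially. A standalone sketch of the slicing idiom the new code keeps:

```python
# Slice a list into fixed-size batches; the final batch may be shorter.
images = list(range(10))
batch_size = 4
batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
assert batch_images == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```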
```diff
@@ -224,11 +194,8 @@ def batch_doc_analyze(
     layout_model=None,
     formula_enable=None,
     table_enable=None,
-    one_shot: bool = True,
 ):
-    parallel_count = None
-    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
-        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])
+    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
     images = []
     page_wh_list = []
     for dataset in datasets:
```
```diff
@@ -238,40 +205,17 @@ def batch_doc_analyze(
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if one_shot and len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        if parallel_count is None:
-            parallel_count = 2  # should check the gpu memory firstly !
-        # split images into parallel_count batches
-        if parallel_count > 1:
-            batch_size = (len(images) + parallel_count - 1) // parallel_count
-            batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
-        else:
-            batch_images = [images]
-        results = []
-        parallel_count = len(batch_images)  # adjust to real parallel count
-        # using concurrent.futures to analyze
-        """
-        with fut.ProcessPoolExecutor(max_workers=parallel_count) as executor:
-            futures = [executor.submit(may_batch_image_analyze, batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)]
-            for future in fut.as_completed(futures):
-                sn, result = future.result()
-                result_history[sn] = result
-        for key in sorted(result_history.keys()):
-            results.extend(result_history[key])
-        """
-        results = []
-        pool = mp.Pool(processes=parallel_count)
-        mapped_results = pool.starmap(may_batch_image_analyze, [(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)])
-        for sn, result in mapped_results:
-            results.extend(result)
+    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
+        batch_size = MIN_BATCH_INFERENCE_SIZE
+        batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
     else:
-        _, results = may_batch_image_analyze(images, 0, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        batch_images = [images]
+
+    results = []
+    for sn, batch_image in enumerate(batch_images):
+        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable)
+        results.extend(result)

     infer_results = []
     from magic_pdf.operators.models import InferenceResult
```
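The same rewrite is applied to `batch_doc_analyze` above. For contrast, the removed path sized its batches by ceiling division so the pages split into at most `parallel_count` groups; a sketch of that arithmetic (the values here are illustrative):

```python
# Ceiling division without math.ceil: (n + k - 1) // k yields groups of size
# at most ceil(n / k), matching the removed batch_size computation.
images = list(range(10))
parallel_count = 3
batch_size = (len(images) + parallel_count - 1) // parallel_count  # ceil(10 / 3) == 4
batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
assert len(batch_images) <= parallel_count
assert batch_images == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```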
magic_pdf/tools/common.py
```diff
@@ -314,7 +314,7 @@ def batch_do_parse(
             dss.append(PymuDocDataset(v, lang=lang))
         else:
             dss.append(v)
-    infer_results = batch_doc_analyze(dss, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable, one_shot=True)
+    infer_results = batch_doc_analyze(dss, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
     for idx, infer_result in enumerate(infer_results):
         _do_parse(output_dir, pdf_file_names[idx], dss[idx], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox)
```
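With `one_shot` removed from the call site above, batching is controlled only through the environment. A hypothetical configuration sketch (only the variable name comes from the diff; the commented call stands in for the real pipeline invocation):

```python
import os

# Hypothetical: lower the batching threshold before invoking the pipeline so
# that runs of 50+ pages are processed in batches of 50.
os.environ['MINERU_MIN_BATCH_INFERENCE_SIZE'] = '50'
# infer_results = batch_doc_analyze(dss, lang=lang, ...)  # reads the variable internally
```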