Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b3faee93
Unverified
Commit
b3faee93
authored
Mar 19, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Mar 19, 2025
Browse files
Merge pull request #1944 from icecraft/feat/remove_intree_parallel_inference
style: remove unused code
parents
e33ec616
e9c24739
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
22 additions
and
78 deletions
+22
-78
magic_pdf/model/doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+21
-77
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+1
-1
No files found.
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
b3faee93
...
@@ -34,8 +34,6 @@ from magic_pdf.model.model_list import MODEL
...
@@ -34,8 +34,6 @@ from magic_pdf.model.model_list import MODEL
# from magic_pdf.operators.models import InferenceResult
# from magic_pdf.operators.models import InferenceResult
MIN_BATCH_INFERENCE_SIZE
=
100
class
ModelSingleton
:
class
ModelSingleton
:
_instance
=
None
_instance
=
None
_models
=
{}
_models
=
{}
...
@@ -143,17 +141,14 @@ def doc_analyze(
...
@@ -143,17 +141,14 @@ def doc_analyze(
layout_model
=
None
,
layout_model
=
None
,
formula_enable
=
None
,
formula_enable
=
None
,
table_enable
=
None
,
table_enable
=
None
,
one_shot
:
bool
=
True
,
):
):
end_page_id
=
(
end_page_id
=
(
end_page_id
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
len
(
dataset
)
-
1
else
len
(
dataset
)
-
1
)
)
parallel_count
=
None
if
os
.
environ
.
get
(
'MINERU_PARALLEL_INFERENCE_COUNT'
):
parallel_count
=
int
(
os
.
environ
[
'MINERU_PARALLEL_INFERENCE_COUNT'
])
MIN_BATCH_INFERENCE_SIZE
=
int
(
os
.
environ
.
get
(
'MINERU_MIN_BATCH_INFERENCE_SIZE'
,
100
))
images
=
[]
images
=
[]
page_wh_list
=
[]
page_wh_list
=
[]
for
index
in
range
(
len
(
dataset
)):
for
index
in
range
(
len
(
dataset
)):
...
@@ -163,42 +158,17 @@ def doc_analyze(
...
@@ -163,42 +158,17 @@ def doc_analyze(
images
.
append
(
img_dict
[
'img'
])
images
.
append
(
img_dict
[
'img'
])
page_wh_list
.
append
((
img_dict
[
'width'
],
img_dict
[
'height'
]))
page_wh_list
.
append
((
img_dict
[
'width'
],
img_dict
[
'height'
]))
if
one_shot
and
len
(
images
)
>=
MIN_BATCH_INFERENCE_SIZE
:
if
len
(
images
)
>=
MIN_BATCH_INFERENCE_SIZE
:
if
parallel_count
is
None
:
batch_size
=
MIN_BATCH_INFERENCE_SIZE
parallel_count
=
2
# should check the gpu memory firstly !
# split images into parallel_count batches
if
parallel_count
>
1
:
batch_size
=
(
len
(
images
)
+
parallel_count
-
1
)
//
parallel_count
batch_images
=
[
images
[
i
:
i
+
batch_size
]
for
i
in
range
(
0
,
len
(
images
),
batch_size
)]
batch_images
=
[
images
[
i
:
i
+
batch_size
]
for
i
in
range
(
0
,
len
(
images
),
batch_size
)]
else
:
else
:
batch_images
=
[
images
]
batch_images
=
[
images
]
results
=
[]
results
=
[]
parallel_count
=
len
(
batch_images
)
# adjust to real parallel count
for
sn
,
batch_image
in
enumerate
(
batch_images
):
# using concurrent.futures to analyze
_
,
result
=
may_batch_image_analyze
(
batch_image
,
sn
,
ocr
,
show_log
,
lang
,
layout_model
,
formula_enable
,
table_enable
)
"""
with fut.ProcessPoolExecutor(max_workers=parallel_count) as executor:
futures = [executor.submit(may_batch_image_analyze, batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)]
for future in fut.as_completed(futures):
sn, result = future.result()
result_history[sn] = result
for key in sorted(result_history.keys()):
results.extend(result_history[key])
"""
results
=
[]
pool
=
mp
.
Pool
(
processes
=
parallel_count
)
mapped_results
=
pool
.
starmap
(
may_batch_image_analyze
,
[(
batch_image
,
sn
,
ocr
,
show_log
,
lang
,
layout_model
,
formula_enable
,
table_enable
)
for
sn
,
batch_image
in
enumerate
(
batch_images
)])
for
sn
,
result
in
mapped_results
:
results
.
extend
(
result
)
results
.
extend
(
result
)
else
:
_
,
results
=
may_batch_image_analyze
(
images
,
0
,
ocr
,
show_log
,
lang
,
layout_model
,
formula_enable
,
table_enable
)
model_json
=
[]
model_json
=
[]
for
index
in
range
(
len
(
dataset
)):
for
index
in
range
(
len
(
dataset
)):
if
start_page_id
<=
index
<=
end_page_id
:
if
start_page_id
<=
index
<=
end_page_id
:
...
@@ -224,11 +194,8 @@ def batch_doc_analyze(
...
@@ -224,11 +194,8 @@ def batch_doc_analyze(
layout_model
=
None
,
layout_model
=
None
,
formula_enable
=
None
,
formula_enable
=
None
,
table_enable
=
None
,
table_enable
=
None
,
one_shot
:
bool
=
True
,
):
):
parallel_count
=
None
MIN_BATCH_INFERENCE_SIZE
=
int
(
os
.
environ
.
get
(
'MINERU_MIN_BATCH_INFERENCE_SIZE'
,
100
))
if
os
.
environ
.
get
(
'MINERU_PARALLEL_INFERENCE_COUNT'
):
parallel_count
=
int
(
os
.
environ
[
'MINERU_PARALLEL_INFERENCE_COUNT'
])
images
=
[]
images
=
[]
page_wh_list
=
[]
page_wh_list
=
[]
for
dataset
in
datasets
:
for
dataset
in
datasets
:
...
@@ -238,40 +205,17 @@ def batch_doc_analyze(
...
@@ -238,40 +205,17 @@ def batch_doc_analyze(
images
.
append
(
img_dict
[
'img'
])
images
.
append
(
img_dict
[
'img'
])
page_wh_list
.
append
((
img_dict
[
'width'
],
img_dict
[
'height'
]))
page_wh_list
.
append
((
img_dict
[
'width'
],
img_dict
[
'height'
]))
if
one_shot
and
len
(
images
)
>=
MIN_BATCH_INFERENCE_SIZE
:
if
len
(
images
)
>=
MIN_BATCH_INFERENCE_SIZE
:
if
parallel_count
is
None
:
batch_size
=
MIN_BATCH_INFERENCE_SIZE
parallel_count
=
2
# should check the gpu memory firstly !
# split images into parallel_count batches
if
parallel_count
>
1
:
batch_size
=
(
len
(
images
)
+
parallel_count
-
1
)
//
parallel_count
batch_images
=
[
images
[
i
:
i
+
batch_size
]
for
i
in
range
(
0
,
len
(
images
),
batch_size
)]
batch_images
=
[
images
[
i
:
i
+
batch_size
]
for
i
in
range
(
0
,
len
(
images
),
batch_size
)]
else
:
else
:
batch_images
=
[
images
]
batch_images
=
[
images
]
results
=
[]
results
=
[]
parallel_count
=
len
(
batch_images
)
# adjust to real parallel count
# using concurrent.futures to analyze
for
sn
,
batch_image
in
enumerate
(
batch_images
):
"""
_
,
result
=
may_batch_image_analyze
(
batch_image
,
sn
,
ocr
,
show_log
,
lang
,
layout_model
,
formula_enable
,
table_enable
)
with fut.ProcessPoolExecutor(max_workers=parallel_count) as executor:
futures = [executor.submit(may_batch_image_analyze, batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable) for sn, batch_image in enumerate(batch_images)]
for future in fut.as_completed(futures):
sn, result = future.result()
result_history[sn] = result
for key in sorted(result_history.keys()):
results.extend(result_history[key])
"""
results
=
[]
pool
=
mp
.
Pool
(
processes
=
parallel_count
)
mapped_results
=
pool
.
starmap
(
may_batch_image_analyze
,
[(
batch_image
,
sn
,
ocr
,
show_log
,
lang
,
layout_model
,
formula_enable
,
table_enable
)
for
sn
,
batch_image
in
enumerate
(
batch_images
)])
for
sn
,
result
in
mapped_results
:
results
.
extend
(
result
)
results
.
extend
(
result
)
else
:
_
,
results
=
may_batch_image_analyze
(
images
,
0
,
ocr
,
show_log
,
lang
,
layout_model
,
formula_enable
,
table_enable
)
infer_results
=
[]
infer_results
=
[]
from
magic_pdf.operators.models
import
InferenceResult
from
magic_pdf.operators.models
import
InferenceResult
...
...
magic_pdf/tools/common.py
View file @
b3faee93
...
@@ -314,7 +314,7 @@ def batch_do_parse(
...
@@ -314,7 +314,7 @@ def batch_do_parse(
dss
.
append
(
PymuDocDataset
(
v
,
lang
=
lang
))
dss
.
append
(
PymuDocDataset
(
v
,
lang
=
lang
))
else
:
else
:
dss
.
append
(
v
)
dss
.
append
(
v
)
infer_results
=
batch_doc_analyze
(
dss
,
lang
=
lang
,
layout_model
=
layout_model
,
formula_enable
=
formula_enable
,
table_enable
=
table_enable
,
one_shot
=
True
)
infer_results
=
batch_doc_analyze
(
dss
,
lang
=
lang
,
layout_model
=
layout_model
,
formula_enable
=
formula_enable
,
table_enable
=
table_enable
)
for
idx
,
infer_result
in
enumerate
(
infer_results
):
for
idx
,
infer_result
in
enumerate
(
infer_results
):
_do_parse
(
output_dir
,
pdf_file_names
[
idx
],
dss
[
idx
],
infer_result
.
get_infer_res
(),
parse_method
,
debug_able
,
f_draw_span_bbox
=
f_draw_span_bbox
,
f_draw_layout_bbox
=
f_draw_layout_bbox
,
f_dump_md
=
f_dump_md
,
f_dump_middle_json
=
f_dump_middle_json
,
f_dump_model_json
=
f_dump_model_json
,
f_dump_orig_pdf
=
f_dump_orig_pdf
,
f_dump_content_list
=
f_dump_content_list
,
f_make_md_mode
=
f_make_md_mode
,
f_draw_model_bbox
=
f_draw_model_bbox
,
f_draw_line_sort_bbox
=
f_draw_line_sort_bbox
,
f_draw_char_bbox
=
f_draw_char_bbox
)
_do_parse
(
output_dir
,
pdf_file_names
[
idx
],
dss
[
idx
],
infer_result
.
get_infer_res
(),
parse_method
,
debug_able
,
f_draw_span_bbox
=
f_draw_span_bbox
,
f_draw_layout_bbox
=
f_draw_layout_bbox
,
f_dump_md
=
f_dump_md
,
f_dump_middle_json
=
f_dump_middle_json
,
f_dump_model_json
=
f_dump_model_json
,
f_dump_orig_pdf
=
f_dump_orig_pdf
,
f_dump_content_list
=
f_dump_content_list
,
f_make_md_mode
=
f_make_md_mode
,
f_draw_model_bbox
=
f_draw_model_bbox
,
f_draw_line_sort_bbox
=
f_draw_line_sort_bbox
,
f_draw_char_bbox
=
f_draw_char_bbox
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment