wangsen / MinerU · Commits
"scripts/git@developer.sourcefind.cn:change/sglang.git" did not exist on "9c58e68b4c18a2d0876b9c2b8706cd6e783e4e1c"
Commit adbf4921
authored Mar 24, 2025 by icecraft

fix: support auto method and auto lang

parent eb02736a
Showing 3 changed files with 50 additions and 13 deletions
magic_pdf/data/dataset.py (+4, -1)
magic_pdf/model/doc_analyze_by_custom_model.py (+31, -9)
magic_pdf/tools/common.py (+15, -3)
magic_pdf/data/dataset.py
```diff
@@ -143,6 +143,7 @@ class PymuDocDataset(Dataset):
         self._records = [Doc(v) for v in self._raw_fitz]
         self._data_bits = bits
         self._raw_data = bits
+        self._classify_result = None
         if lang == '':
             self._lang = None
@@ -218,7 +219,9 @@ class PymuDocDataset(Dataset):
         Returns:
             SupportedPdfParseMethod: _description_
         """
-        return classify(self._data_bits)
+        if self._classify_result is None:
+            self._classify_result = classify(self._data_bits)
+        return self._classify_result

     def clone(self):
         """clone this dataset."""
```
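The hunk above memoizes the page-type classification so repeated `classify()` calls on the same dataset do not rerun the classifier. A minimal, self-contained sketch of that pattern (the class and classifier below are hypothetical stand-ins, not MinerU code):

```python
class CachedClassifier:
    """Illustrates the memoization added to PymuDocDataset.classify()."""

    def __init__(self, data_bits: bytes):
        self._data_bits = data_bits
        self._classify_result = None  # populated on the first classify() call

    def _expensive_classify(self, bits: bytes) -> str:
        # Stand-in for magic_pdf's classify(); imagine this call is costly.
        return 'txt' if bits.startswith(b'%PDF') else 'ocr'

    def classify(self) -> str:
        if self._classify_result is None:
            self._classify_result = self._expensive_classify(self._data_bits)
        return self._classify_result


c = CachedClassifier(b'%PDF-1.7 ...')
assert c.classify() == 'txt'
assert c.classify() == 'txt'  # second call reuses the cached result
```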
magic_pdf/model/doc_analyze_by_custom_model.py
```diff
@@ -189,26 +189,48 @@ def batch_doc_analyze(
     table_enable=None,
 ):
     MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
+    batch_size = MIN_BATCH_INFERENCE_SIZE
     images = []
     page_wh_list = []
+    lang_list = []
+    lang_s = set()
     for dataset in datasets:
         for index in range(len(dataset)):
+            if lang is None or lang == 'auto':
+                lang_list.append(dataset._lang)
+            else:
+                lang_list.append(lang)
+            lang_s.add(lang_list[-1])
             page_data = dataset.get_page(index)
             img_dict = page_data.get_image()
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
-        batch_size = MIN_BATCH_INFERENCE_SIZE
-        batch_images = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
-    else:
-        batch_images = [images]
-    results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, lang, layout_model, formula_enable, table_enable)
-        results.extend(result)
+    batch_images = []
+    img_idx_list = []
+    for t_lang in lang_s:
+        tmp_img_idx_list = []
+        for i, _lang in enumerate(lang_list):
+            if _lang == t_lang:
+                tmp_img_idx_list.append(i)
+        img_idx_list.extend(tmp_img_idx_list)
+        if batch_size >= len(tmp_img_idx_list):
+            batch_images.append((t_lang, [images[j] for j in tmp_img_idx_list]))
+        else:
+            slices = [tmp_img_idx_list[k:k + batch_size] for k in range(0, len(tmp_img_idx_list), batch_size)]
+            for arr in slices:
+                batch_images.append((t_lang, [images[j] for j in arr]))
+    unorder_results = []
+    for sn, (_lang, batch_image) in enumerate(batch_images):
+        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log, _lang, layout_model, formula_enable, table_enable)
+        unorder_results.extend(result)
+    results = [None] * len(img_idx_list)
+    for i, idx in enumerate(img_idx_list):
+        results[idx] = unorder_results[i]
     infer_results = []

     from magic_pdf.operators.models import InferenceResult
```
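The rewritten batching groups page images by their resolved language, runs one `may_batch_image_analyze` call per language batch, and then scatters the out-of-order results back into original page order via `img_idx_list`. A self-contained sketch of that grouping-and-restoring logic, with plain strings standing in for page images and a fake analyzer (nothing here imports MinerU):

```python
def group_and_restore(items, lang_list, batch_size=2):
    """Group item indices by language, process per-language batches,
    then scatter results back into the original order (mirrors the diff)."""
    batch_groups, img_idx_list = [], []
    for t_lang in set(lang_list):
        idxs = [i for i, l in enumerate(lang_list) if l == t_lang]
        img_idx_list.extend(idxs)
        for k in range(0, len(idxs), batch_size):
            chunk = idxs[k:k + batch_size]
            batch_groups.append((t_lang, [items[j] for j in chunk]))

    unordered = []
    for t_lang, batch in batch_groups:
        # Stand-in for may_batch_image_analyze(): tag each item with its language.
        unordered.extend(f'{t_lang}:{x}' for x in batch)

    results = [None] * len(img_idx_list)
    for i, idx in enumerate(img_idx_list):
        results[idx] = unordered[i]
    return results


print(group_and_restore(['p0', 'p1', 'p2'], ['en', 'ch', 'en']))
# ['en:p0', 'ch:p1', 'en:p2']  (per-language batches, original page order restored)
```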
magic_pdf/tools/common.py
```diff
@@ -281,7 +281,7 @@ def do_parse(
             ds = PymuDocDataset(pdf_bytes, lang=lang)
         else:
             ds = pdf_bytes_or_dataset
-        batch_do_parse(output_dir, [pdf_file_name], [ds], parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox)
+        batch_do_parse(output_dir, [pdf_file_name], [ds], parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang)
     else:
         _do_parse(output_dir, pdf_file_name, pdf_bytes_or_dataset, model_list, parse_method, debug_able, start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox)
@@ -314,9 +314,21 @@ def batch_do_parse(
             dss.append(PymuDocDataset(v, lang=lang))
         else:
             dss.append(v)
-    infer_results = batch_doc_analyze(dss, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
+    dss_with_fn = list(zip(dss, pdf_file_names))
+    if parse_method == 'auto':
+        dss_typed_txt = [(i, x) for i, x in enumerate(dss_with_fn) if x[0].classify() == SupportedPdfParseMethod.TXT]
+        dss_typed_ocr = [(i, x) for i, x in enumerate(dss_with_fn) if x[0].classify() == SupportedPdfParseMethod.OCR]
+        infer_results = [None] * len(dss_with_fn)
+        infer_results_txt = batch_doc_analyze([x[1][0] for x in dss_typed_txt], lang=lang, ocr=False, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
+        infer_results_ocr = batch_doc_analyze([x[1][0] for x in dss_typed_ocr], lang=lang, ocr=True, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
+        for i, infer_res in enumerate(infer_results_txt):
+            infer_results[dss_typed_txt[i][0]] = infer_res
+        for i, infer_res in enumerate(infer_results_ocr):
+            infer_results[dss_typed_ocr[i][0]] = infer_res
+    else:
+        infer_results = batch_doc_analyze(dss, lang=lang, ocr=parse_method == 'ocr', layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
     for idx, infer_result in enumerate(infer_results):
-        _do_parse(output_dir, pdf_file_names[idx], dss[idx], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox)
+        _do_parse(output_dir, dss_with_fn[idx][1], dss_with_fn[idx][0], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang)

 parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
```
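With these changes a caller can leave both knobs on auto: `parse_method='auto'` makes `batch_do_parse` classify each dataset and route it through the TXT or OCR pipeline, and `lang='auto'` lets `batch_doc_analyze` group pages by each dataset's own language. A hedged usage sketch; the import paths follow the file paths in this commit, the positional argument order mirrors the `batch_do_parse` call shown in the diff above, and the PDF path is a placeholder:

```python
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.tools.common import batch_do_parse

with open('example.pdf', 'rb') as f:  # placeholder input file
    pdf_bytes = f.read()

ds = PymuDocDataset(pdf_bytes, lang='')   # empty lang -> dataset keeps _lang = None
method = ds.classify()                    # first call classifies and caches the result
assert method == ds.classify()            # second call returns the cached value

# parse_method 'auto' splits datasets into TXT vs OCR groups via classify();
# lang='auto' groups pages by each dataset's own language for inference.
batch_do_parse('output', ['example'], [ds], 'auto', False, lang='auto')
```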