Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
54f065d0
Commit
54f065d0
authored
Jul 05, 2025
by
myhloli
Browse files
refactor: standardize parameter names for formula and table parsing in demo.py
parent
19647ddc
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
4 deletions
+4
-4
demo/demo.py
demo/demo.py
+4
-4
No files found.
demo/demo.py
View file @
54f065d0
...
...
@@ -25,8 +25,8 @@ def do_parse(
p_lang_list
:
list
[
str
],
# List of languages for each PDF, default is 'ch' (Chinese)
backend
=
"pipeline"
,
# The backend for parsing PDF, default is 'pipeline'
parse_method
=
"auto"
,
# The method for parsing PDF, default is 'auto'
p_
formula_enable
=
True
,
# Enable formula parsing
p_
table_enable
=
True
,
# Enable table parsing
formula_enable
=
True
,
# Enable formula parsing
table_enable
=
True
,
# Enable table parsing
server_url
=
None
,
# Server URL for vlm-sglang-client backend
f_draw_layout_bbox
=
True
,
# Whether to draw layout bounding boxes
f_draw_span_bbox
=
True
,
# Whether to draw span bounding boxes
...
...
@@ -45,7 +45,7 @@ def do_parse(
new_pdf_bytes
=
convert_pdf_bytes_to_bytes_by_pypdfium2
(
pdf_bytes
,
start_page_id
,
end_page_id
)
pdf_bytes_list
[
idx
]
=
new_pdf_bytes
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
p_
formula_enable
,
table_enable
=
p_
table_enable
)
infer_results
,
all_image_lists
,
all_pdf_docs
,
lang_list
,
ocr_enabled_list
=
pipeline_doc_analyze
(
pdf_bytes_list
,
p_lang_list
,
parse_method
=
parse_method
,
formula_enable
=
formula_enable
,
table_enable
=
table_enable
)
for
idx
,
model_list
in
enumerate
(
infer_results
):
model_json
=
copy
.
deepcopy
(
model_list
)
...
...
@@ -57,7 +57,7 @@ def do_parse(
pdf_doc
=
all_pdf_docs
[
idx
]
_lang
=
lang_list
[
idx
]
_ocr_enable
=
ocr_enabled_list
[
idx
]
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr_enable
,
p_
formula_enable
)
middle_json
=
pipeline_result_to_middle_json
(
model_list
,
images_list
,
pdf_doc
,
image_writer
,
_lang
,
_ocr_enable
,
formula_enable
)
pdf_info
=
middle_json
[
"pdf_info"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment