Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
2d742bca
Commit
2d742bca
authored
Jul 11, 2025
by
myhloli
Browse files
feat: improve heading level feature with enhanced configuration and error handling
parent
5f4f5174
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
38 additions
and
44 deletions
+38
-44
mineru/backend/vlm/token_to_middle_json.py
mineru/backend/vlm/token_to_middle_json.py
+38
-44
No files found.
mineru/backend/vlm/token_to_middle_json.py
View file @
2d742bca
...
...
@@ -11,13 +11,18 @@ from mineru.utils.pdf_image_tools import get_crop_img
from
mineru.version
import
__version__
heading_level_import_success
=
False
try
:
llm_aided_config
=
get_llm_aided_config
()
if
llm_aided_config
is
not
None
:
title_aided_config
=
llm_aided_config
.
get
(
'title_aided'
,
None
)
if
title_aided_config
is
not
None
:
if
title_aided_config
.
get
(
'enable'
,
False
):
try
:
from
mineru.utils.llm_aided
import
llm_aided_title
from
mineru.backend.pipeline.model_init
import
AtomModelSingleton
heading_level_import_success
=
True
except
Exception
as
e
:
except
Exception
as
e
:
logger
.
warning
(
"The heading level feature cannot be used. If you need to use the heading level feature, "
"please execute `pip install mineru[
pipelin
e]` to install the required packages."
)
"please execute `pip install mineru[
cor
e]` to install the required packages."
)
def
token_to_page_info
(
token
,
image_dict
,
page
,
image_writer
,
page_index
)
->
dict
:
...
...
@@ -38,11 +43,6 @@ def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dic
title_blocks
=
magic_model
.
get_title_blocks
()
# 如果有标题优化需求,则对title_blocks截图det
llm_aided_config
=
get_llm_aided_config
()
if
llm_aided_config
is
not
None
:
title_aided_config
=
llm_aided_config
.
get
(
'title_aided'
,
None
)
if
title_aided_config
is
not
None
:
if
title_aided_config
.
get
(
'enable'
,
False
):
if
heading_level_import_success
:
atom_model_manager
=
AtomModelSingleton
()
ocr_model
=
atom_model_manager
.
get_atom_model
(
...
...
@@ -91,13 +91,7 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
page_info
=
token_to_page_info
(
token
,
image_dict
,
page
,
image_writer
,
index
)
middle_json
[
"pdf_info"
].
append
(
page_info
)
"""llm优化"""
llm_aided_config
=
get_llm_aided_config
()
if
llm_aided_config
is
not
None
:
"""标题优化"""
title_aided_config
=
llm_aided_config
.
get
(
'title_aided'
,
None
)
if
title_aided_config
is
not
None
:
if
title_aided_config
.
get
(
'enable'
,
False
):
"""llm优化标题分级"""
if
heading_level_import_success
:
llm_aided_title_start_time
=
time
.
time
()
llm_aided_title
(
middle_json
[
"pdf_info"
],
title_aided_config
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment