Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
35cb414f
Commit
35cb414f
authored
Jul 01, 2025
by
myhloli
Browse files
feat: integrate LLM optimization for title enhancement in PDF processing
parent
2b17dee1
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
3 deletions
+18
-3
mineru/backend/vlm/token_to_middle_json.py
mineru/backend/vlm/token_to_middle_json.py
+17
-1
mineru/utils/llm_aided.py
mineru/utils/llm_aided.py
+1
-2
No files found.
mineru/backend/vlm/token_to_middle_json.py
View file @
35cb414f
import
re
import
time
from
loguru
import
logger
from
mineru.utils.config_reader
import
get_llm_aided_config
from
mineru.utils.cut_image
import
cut_image_and_table
from
mineru.utils.cut_image
import
cut_image_and_table
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.enum_class
import
BlockType
,
ContentType
from
mineru.utils.hash_utils
import
str_md5
from
mineru.utils.hash_utils
import
str_md5
from
mineru.backend.vlm.vlm_magic_model
import
MagicModel
from
mineru.backend.vlm.vlm_magic_model
import
MagicModel
from
mineru.utils.llm_aided
import
llm_aided_title
from
mineru.version
import
__version__
from
mineru.version
import
__version__
...
@@ -48,6 +51,19 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
...
@@ -48,6 +51,19 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
image_dict
=
images_list
[
index
]
image_dict
=
images_list
[
index
]
page_info
=
token_to_page_info
(
token
,
image_dict
,
page
,
image_writer
,
index
)
page_info
=
token_to_page_info
(
token
,
image_dict
,
page
,
image_writer
,
index
)
middle_json
[
"pdf_info"
].
append
(
page_info
)
middle_json
[
"pdf_info"
].
append
(
page_info
)
"""llm优化"""
llm_aided_config
=
get_llm_aided_config
()
if
llm_aided_config
is
not
None
:
"""标题优化"""
title_aided_config
=
llm_aided_config
.
get
(
'title_aided'
,
None
)
if
title_aided_config
is
not
None
:
if
title_aided_config
.
get
(
'enable'
,
False
):
llm_aided_title_start_time
=
time
.
time
()
llm_aided_title
(
middle_json
[
"pdf_info"
],
title_aided_config
)
logger
.
info
(
f
'llm aided title time:
{
round
(
time
.
time
()
-
llm_aided_title_start_time
,
2
)
}
'
)
# 关闭pdf文档
# 关闭pdf文档
pdf_doc
.
close
()
pdf_doc
.
close
()
return
middle_json
return
middle_json
...
...
mineru/utils/llm_aided.py
View file @
35cb414f
# Copyright (c) Opendatalab. All rights reserved.
# Copyright (c) Opendatalab. All rights reserved.
from
loguru
import
logger
from
loguru
import
logger
from
openai
import
OpenAI
from
openai
import
OpenAI
import
ast
import
json_repair
from
mineru.backend.pipeline.pipeline_middle_json_mkcontent
import
merge_para_with_text
from
mineru.backend.pipeline.pipeline_middle_json_mkcontent
import
merge_para_with_text
...
@@ -91,7 +91,6 @@ Corrected title list:
...
@@ -91,7 +91,6 @@ Corrected title list:
if
"</think>"
in
content
:
if
"</think>"
in
content
:
idx
=
content
.
index
(
"</think>"
)
+
len
(
"</think>"
)
idx
=
content
.
index
(
"</think>"
)
+
len
(
"</think>"
)
content
=
content
[
idx
:].
strip
()
content
=
content
[
idx
:].
strip
()
import
json_repair
dict_completion
=
json_repair
.
loads
(
content
)
dict_completion
=
json_repair
.
loads
(
content
)
dict_completion
=
{
int
(
k
):
int
(
v
)
for
k
,
v
in
dict_completion
.
items
()}
dict_completion
=
{
int
(
k
):
int
(
v
)
for
k
,
v
in
dict_completion
.
items
()}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment