Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
20a623ac
Unverified
Commit
20a623ac
authored
Feb 21, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Feb 21, 2025
Browse files
Merge pull request #1734 from myhloli/dev
refactor(magic_pdf): improve title optimization process
parents
e0f591ec
54940c61
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
16 deletions
+14
-16
magic_pdf/post_proc/llm_aided.py
magic_pdf/post_proc/llm_aided.py
+14
-16
No files found.
magic_pdf/post_proc/llm_aided.py
View file @
20a623ac
...
@@ -3,6 +3,7 @@ import json
...
@@ -3,6 +3,7 @@ import json
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.dict2md.ocr_mkcontent
import
merge_para_with_text
from
magic_pdf.dict2md.ocr_mkcontent
import
merge_para_with_text
from
openai
import
OpenAI
from
openai
import
OpenAI
import
ast
#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
...
@@ -119,11 +120,12 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
...
@@ -119,11 +120,12 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
- 在完成初步分级后,仔细检查分级结果的合理性
- 在完成初步分级后,仔细检查分级结果的合理性
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
- 确保最终的分级结果符合文档的实际结构和逻辑
- 确保最终的分级结果符合文档的实际结构和逻辑
- 字典中包含OCR错误识别的标题,你可以通过将其层级标记为 0 来排除它们
IMPORTANT:
IMPORTANT:
请直接返回优化过的由标题层级组成的
json,格式
如下:
请直接返回优化过的由标题层级组成的
字典,格式为{{标题id:标题层级}},
如下:
{{
"0":1,"1":2,"2":2,"3"
:3}}
{{
0:1,1:2,2:2,3
:3}}
返回的json不需要格式化
。
不需要对字典格式化,不需要返回任何其他信息
。
Input title list:
Input title list:
{
title_dict
}
{
title_dict
}
...
@@ -133,7 +135,7 @@ Corrected title list:
...
@@ -133,7 +135,7 @@ Corrected title list:
retry_count
=
0
retry_count
=
0
max_retries
=
3
max_retries
=
3
json
_completion
=
None
dict
_completion
=
None
while
retry_count
<
max_retries
:
while
retry_count
<
max_retries
:
try
:
try
:
...
@@ -143,24 +145,20 @@ Corrected title list:
...
@@ -143,24 +145,20 @@ Corrected title list:
{
'role'
:
'user'
,
'content'
:
title_optimize_prompt
}],
{
'role'
:
'user'
,
'content'
:
title_optimize_prompt
}],
temperature
=
0.7
,
temperature
=
0.7
,
)
)
json_completion
=
json
.
loads
(
completion
.
choices
[
0
].
message
.
content
)
# logger.info(f"Title completion: {completion.choices[0].message.content}")
dict_completion
=
ast
.
literal_eval
(
completion
.
choices
[
0
].
message
.
content
)
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
# logger.info(f"Title completion: {json_completion}")
if
len
(
dict_completion
)
==
len
(
title_dict
):
# logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
if
len
(
json_completion
)
==
len
(
title_dict
):
for
i
,
origin_title_block
in
enumerate
(
origin_title_list
):
for
i
,
origin_title_block
in
enumerate
(
origin_title_list
):
origin_title_block
[
"level"
]
=
int
(
json
_completion
[
str
(
i
)
])
origin_title_block
[
"level"
]
=
int
(
dict
_completion
[
i
])
break
break
else
:
else
:
logger
.
warning
(
"The number of titles in the optimized result is not equal to the number of titles in the input."
)
logger
.
warning
(
"The number of titles in the optimized result is not equal to the number of titles in the input."
)
retry_count
+=
1
retry_count
+=
1
except
Exception
as
e
:
except
Exception
as
e
:
if
isinstance
(
e
,
json
.
decoder
.
JSONDecodeError
):
logger
.
warning
(
f
"JSON decode error on attempt
{
retry_count
+
1
}
:
{
e
}
"
)
else
:
logger
.
exception
(
e
)
logger
.
exception
(
e
)
retry_count
+=
1
retry_count
+=
1
if
json
_completion
is
None
:
if
dict
_completion
is
None
:
logger
.
error
(
"Failed to decode
JSON
after maximum retries."
)
logger
.
error
(
"Failed to decode
dict
after maximum retries."
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment