Commit 0a468eca authored by myhloli's avatar myhloli
Browse files

feat(llm_aided): add title optimization feature

- Implement llm_aided_title function to optimize document titles using LLM
- Update pdf_parse_union_core_v2.py to include title optimization
- Modify ocr_mkcontent.py to use optimized title levels- Add openai SDK dependency in setup.py
parent da3257a6
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
"formula_aided": { "formula_aided": {
"api_key": "your_api_key", "api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-72b-instruct", "model": "qwen2.5-7b-instruct",
"enable": false "enable": false
}, },
"text_aided": { "text_aided": {
...@@ -31,6 +31,12 @@ ...@@ -31,6 +31,12 @@
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-7b-instruct", "model": "qwen2.5-7b-instruct",
"enable": false "enable": false
},
"title_aided": {
"api_key": "your_api_key",
"base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
"model": "qwen2.5-32b-instruct",
"enable": false
} }
}, },
"config_version": "1.1.0" "config_version": "1.1.0"
......
...@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, ...@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
if para_type in [BlockType.Text, BlockType.List, BlockType.Index]: if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
para_text = merge_para_with_text(para_block) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_text = f'# {merge_para_with_text(para_block)}' title_level = get_title_level(para_block)
para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
...@@ -186,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason ...@@ -186,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
} }
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
title_level = get_title_level(para_block)
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'text_level': 1, 'text_level': title_level,
} }
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_content = { para_content = {
...@@ -289,3 +291,12 @@ def union_make(pdf_info_dict: list, ...@@ -289,3 +291,12 @@ def union_make(pdf_info_dict: list,
return '\n\n'.join(output_content) return '\n\n'.join(output_content)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
return output_content return output_content
def get_title_level(block):
title_level = block.get('level', 1)
if title_level > 4:
title_level = 4
elif title_level < 1:
title_level = 1
return title_level
\ No newline at end of file
...@@ -19,7 +19,7 @@ from magic_pdf.libs.convert_utils import dict_to_list ...@@ -19,7 +19,7 @@ from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
try: try:
import torchtext import torchtext
...@@ -846,6 +846,10 @@ def pdf_parse_union( ...@@ -846,6 +846,10 @@ def pdf_parse_union(
text_aided_config = llm_aided_config.get('text_aided', None) text_aided_config = llm_aided_config.get('text_aided', None)
if text_aided_config is not None: if text_aided_config is not None:
llm_aided_text(pdf_info_dict, text_aided_config) llm_aided_text(pdf_info_dict, text_aided_config)
"""标题优化"""
title_aided_config = llm_aided_config.get('title_aided', None)
if title_aided_config is not None:
llm_aided_title(pdf_info_dict, title_aided_config)
"""dict转list""" """dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict) pdf_info_list = dict_to_list(pdf_info_dict)
......
# Copyright (c) Opendatalab. All rights reserved. # Copyright (c) Opendatalab. All rights reserved.
import json
from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from openai import OpenAI
formula_correction_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
1. 修正渲染或编译错误: 1. 修正渲染或编译错误:
- Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles. - Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
...@@ -18,7 +23,7 @@ $FORMULA ...@@ -18,7 +23,7 @@ $FORMULA
Your corrected result: Your corrected result:
""" """
text_correction_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容: text_optimize_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
1. 修正OCR引起的拼写错误和错误: 1. 修正OCR引起的拼写错误和错误:
- 修正常见的OCR错误(例如,'rn' 被误读为 'm') - 修正常见的OCR错误(例如,'rn' 被误读为 'm')
...@@ -61,4 +66,67 @@ def llm_aided_formula(pdf_info_dict, formula_aided_config): ...@@ -61,4 +66,67 @@ def llm_aided_formula(pdf_info_dict, formula_aided_config):
pass pass
def llm_aided_text(pdf_info_dict, text_aided_config): def llm_aided_text(pdf_info_dict, text_aided_config):
pass pass
\ No newline at end of file
def llm_aided_title(pdf_info_dict, title_aided_config):
client = OpenAI(
api_key=title_aided_config["api_key"],
base_url=title_aided_config["base_url"],
)
title_dict = {}
origin_title_list = []
i = 0
for page_num, page in pdf_info_dict.items():
blocks = page["para_blocks"]
for block in blocks:
if block["type"] == "title":
origin_title_list.append(block)
title_text = merge_para_with_text(block)
title_dict[f"{i}"] = title_text
i += 1
logger.info(f"Title list: {title_dict}")
title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:
1. 保留原始内容:
- 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
- 请务必保证输出的字典中元素的数量和输入的数量一致
2. 保持字典内key-value的对应关系不变
3. 优化层次结构:
- 为每个标题元素添加适当的层次结构
- 标题层级应具有连续性,不能跳过某一层级
- 标题层级最多为4级,不要添加过多的层级
- 优化后的标题为一个整数,代表该标题的层级
IMPORTANT:
请直接返回优化过的由标题层级组成的json,返回的json不需要格式化。
Input title list:
{title_dict}
Corrected title list:
"""
completion = client.chat.completions.create(
model=title_aided_config["model"],
messages=[
{'role': 'user', 'content': title_optimize_prompt}],
temperature=0.7,
)
json_completion = json.loads(completion.choices[0].message.content)
logger.info(f"Title completion: {json_completion}")
logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
if len(json_completion) == len(title_dict):
try:
for i, origin_title_block in enumerate(origin_title_list):
origin_title_block["level"] = int(json_completion[str(i)])
except Exception as e:
logger.exception(e)
else:
logger.error("The number of titles in the optimized result is not equal to the number of titles in the input.")
...@@ -52,6 +52,7 @@ if __name__ == '__main__': ...@@ -52,6 +52,7 @@ if __name__ == '__main__':
"rapidocr-paddle", # rapidocr-paddle "rapidocr-paddle", # rapidocr-paddle
"rapid_table", # rapid_table "rapid_table", # rapid_table
"PyYAML", # yaml "PyYAML", # yaml
"openai", # openai SDK
"detectron2" "detectron2"
], ],
"old_linux":[ "old_linux":[
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment