"tests/vscode:/vscode.git/clone" did not exist on "73acebb8cfbd1d2954cabe1af4185f9994e61917"
Commit 2d742bca authored by myhloli
Browse files

feat: improve heading level feature with enhanced configuration and error handling

parent 5f4f5174
...@@ -11,13 +11,18 @@ from mineru.utils.pdf_image_tools import get_crop_img ...@@ -11,13 +11,18 @@ from mineru.utils.pdf_image_tools import get_crop_img
from mineru.version import __version__ from mineru.version import __version__
heading_level_import_success = False heading_level_import_success = False
try: llm_aided_config = get_llm_aided_config()
if llm_aided_config is not None:
title_aided_config = llm_aided_config.get('title_aided', None)
if title_aided_config is not None:
if title_aided_config.get('enable', False):
try:
from mineru.utils.llm_aided import llm_aided_title from mineru.utils.llm_aided import llm_aided_title
from mineru.backend.pipeline.model_init import AtomModelSingleton from mineru.backend.pipeline.model_init import AtomModelSingleton
heading_level_import_success = True heading_level_import_success = True
except Exception as e: except Exception as e:
logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, " logger.warning("The heading level feature cannot be used. If you need to use the heading level feature, "
"please execute `pip install mineru[pipeline]` to install the required packages.") "please execute `pip install mineru[core]` to install the required packages.")
def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict: def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict:
...@@ -38,11 +43,6 @@ def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dic ...@@ -38,11 +43,6 @@ def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dic
title_blocks = magic_model.get_title_blocks() title_blocks = magic_model.get_title_blocks()
# 如果有标题优化需求,则对title_blocks截图det # 如果有标题优化需求,则对title_blocks截图det
llm_aided_config = get_llm_aided_config()
if llm_aided_config is not None:
title_aided_config = llm_aided_config.get('title_aided', None)
if title_aided_config is not None:
if title_aided_config.get('enable', False):
if heading_level_import_success: if heading_level_import_success:
atom_model_manager = AtomModelSingleton() atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model( ocr_model = atom_model_manager.get_atom_model(
...@@ -91,13 +91,7 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer): ...@@ -91,13 +91,7 @@ def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
page_info = token_to_page_info(token, image_dict, page, image_writer, index) page_info = token_to_page_info(token, image_dict, page, image_writer, index)
middle_json["pdf_info"].append(page_info) middle_json["pdf_info"].append(page_info)
"""llm优化""" """llm优化标题分级"""
llm_aided_config = get_llm_aided_config()
if llm_aided_config is not None:
"""标题优化"""
title_aided_config = llm_aided_config.get('title_aided', None)
if title_aided_config is not None:
if title_aided_config.get('enable', False):
if heading_level_import_success: if heading_level_import_success:
llm_aided_title_start_time = time.time() llm_aided_title_start_time = time.time()
llm_aided_title(middle_json["pdf_info"], title_aided_config) llm_aided_title(middle_json["pdf_info"], title_aided_config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment