Commit c0a9d1c7 authored by zhougaofeng
Browse files

Update ocr_mkcontent.py

parent d4e904ba
...@@ -17,9 +17,8 @@ from magic_pdf.libs.ocr_content_type import BlockType, ContentType ...@@ -17,9 +17,8 @@ from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# 普通 非vllm # 普通 非vllm
from magic_pdf.dict2md.ocr_client import PredictClient,compress_image from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
client = None
status = None
def __is_hyphen_at_line_end(line): def __is_hyphen_at_line_end(line):
""" """
...@@ -127,21 +126,12 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''): ...@@ -127,21 +126,12 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
return page_markdown return page_markdown
def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout, def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode, mode,
img_buket_path=''): img_buket_path=''):
page_markdown = [] page_markdown = []
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
# logger.info(f'ocr_server:{url}')
client = PredictClient(url)
status = client.check_health()
if not status:
pdf_ocr = None
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
return None
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_text = '' para_text = ''
para_type = para_block['type'] para_type = para_block['type']
...@@ -434,12 +424,24 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list): ...@@ -434,12 +424,24 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
return content_list return content_list
def union_make(config_path: str, def union_make(
config_path: str,
pdf_info_dict: list, pdf_info_dict: list,
make_mode: str, make_mode: str,
drop_mode: str, drop_mode: str,
img_buket_path: str = ''): img_buket_path: str = ''):
output_content = [] output_content = []
global client
global status
config = configparser.ConfigParser()
config.read(config_path)
url = config.get('server', 'ocr_server')
# logger.info(f'ocr_server:{url}')
client = PredictClient(url)
status = client.check_health()
if not status:
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
if page_info.get('need_drop', False): if page_info.get('need_drop', False):
drop_reason = page_info.get('drop_reason') drop_reason = page_info.get('drop_reason')
...@@ -462,11 +464,11 @@ def union_make(config_path: str, ...@@ -462,11 +464,11 @@ def union_make(config_path: str,
continue continue
if make_mode == MakeMode.MM_MD: if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
config_path,paras_of_layout, 'mm', img_buket_path) paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD: elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2( page_markdown = ocr_mk_markdown_with_para_core_v2(
config_path,paras_of_layout, 'nlp') paras_of_layout, 'nlp')
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout: for para_block in paras_of_layout:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment