Commit 9ab44892 authored by zhougaofeng
Browse files

Update ocr_mkcontent.py

parent 304bd577
......@@ -11,7 +11,7 @@ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# import pypandoc
from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
......@@ -130,6 +130,12 @@ def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
url = config.get('server', 'ocr_server')
# logger.info(f'ocr_server:{url}')
client = PredictClient(url)
status = PredictClient.check_health()
if not status:
pdf_ocr = None
logger.warning(f'Health check failed. The server at "{url}" is not responding as expected.')
logger.info(f'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务')
return None
for para_block in paras_of_layout:
para_text = ''
para_type = para_block['type']
......@@ -178,14 +184,17 @@ def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
else:
# 处理图片
# para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
start = time.time()
image_path = join_path(img_buket_path, span['image_path'])
compress_image(image_path)
generated_text = client.predict(image_path, text)
end = time.time()
logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}')
para_text += generated_text
if status:
text = '解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
start = time.time()
image_path = join_path(img_buket_path, span['image_path'])
compress_image(image_path)
generated_text = client.predict(image_path, text)
end = time.time()
logger.info(f'qwen解析{image_path}表格的内容为:{generated_text},耗时为:{end-start}')
para_text += generated_text
else:
para_text += f"----------------图片路径为({join_path(img_buket_path, span['image_path'])}),请检查qwen ocr服务,重新运行文件解析------------------- \n"
for block in para_block['blocks']: # 3rd.拼table_footnote
if block['type'] == BlockType.TableFootnote:
para_text += merge_para_with_text(block)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment