Commit b2bb218c authored by zhougaofeng
Browse files

Update ofd_parse.py

parent fb058635
...@@ -8,6 +8,10 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image ...@@ -8,6 +8,10 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
import configparser import configparser
from magic_pdf.parse.pdf_client import ocrPdfClient from magic_pdf.parse.pdf_client import ocrPdfClient
import html import html
import requests
def decode_html_entities(text): def decode_html_entities(text):
# 将 HTML 实体转换为相应的字符 # 将 HTML 实体转换为相应的字符
...@@ -90,40 +94,40 @@ def ofd2img(file_path,output_dir): ...@@ -90,40 +94,40 @@ def ofd2img(file_path,output_dir):
return output_files,pdfbytes return output_files,pdfbytes
def parse_ofd(config_path,file_path,output_dir):
config = configparser.ConfigParser() class ocrOfdClient:
config.read(config_path) def __init__(self, api_url):
url = config.get('server', 'ocr_server') self.api_url = api_url
client = PredictClient(url)
ofd_imgs,pdfbytes = ofd2img(file_path,output_dir) def check_health(self):
# logger.info(f'url:{url}\tofd_img:{ofd_imgs}') health_check_url = f'{self.api_url}/health'
text = '判断图片是否是发票,如果是发票精确提取图片中的内容,否则返回False' try:
ofd_txts = '' response = requests.get(health_check_url)
for ofd_img in ofd_imgs: if response.status_code == 200:
compress_image(ofd_img) logger.info("Server is healthy and ready to process requests.")
res = client.predict(ofd_img,text) return True
if 'False' in res or 'false' in res: else:
ofd_pdf = ofd2pdf(file_path,output_dir,pdfbytes) logger.error(f'Server health check failed with status code:{response.status_code}')
logger.info(f'ofd_pdf:{ofd_pdf}') return False
pdf_server = config.get('server', 'pdf_server') except requests.exceptions.RequestException as e:
pdf_ocr = ocrPdfClient(pdf_server) logger.error(f'Health check request failed:{e}')
ofd_txt = pdf_ocr.ocr_pdf_client(path=ofd_pdf, output_dir=output_dir) return False
break
def parse_ofd(self,config_path,file_path,output_dir):
# 构造请求数据
data = {
"path": str(file_path),
"output_dir": str(output_dir),
"config_path": str(config_path),
}
# 发送 POST 请求
response = requests.post(f"{self.api_url}/ofd_ocr", json=data)
# 处理响应
if response.status_code == 200:
result = response.json()
logger.info(f"文件解析成功,输出路径:{result['output_path']}")
return result['output_path']
else: else:
res = decode_html_entities(res) logger.error(f"文件解析失败,错误信息:{response.json()}")
res = json_to_txt(res)
ofd_txts = ofd_txts + res + '\n'
if ofd_txts != '':
file_name = os.path.basename(file_name).split('.')
ofd_txt = os.path.join(output_dir,file_name) + '.txt'
logger.info(f'ofd_txt:{ofd_txt}')
with open(ofd_txt, 'w', encoding='utf-8') as f:
f.write(str(ofd_txts))
return ofd_txt
#
# if __name__ == '__main__':
# file_path = ''
# out_path = ''
# ofd2pdf()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment