Update ofd_parse.py

b2bb218c · zhougaofeng · fb058635 · b2bb218c
Commit b2bb218c authored Nov 13, 2024 by zhougaofeng
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 40 additions and 36 deletions

magic_pdf/parse/ofd_parse.py magic_pdf/parse/ofd_parse.py +40 -36

No files found.
--- a/magic_pdf/parse/ofd_parse.py
+++ b/magic_pdf/parse/ofd_parse.py
@@ -8,6 +8,10 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
 import configparser
 from magic_pdf.parse.pdf_client import ocrPdfClient
 import html
+import requests
 def decode_html_entities(text):
    # 将 HTML 实体转换为相应的字符
@@ -90,40 +94,40 @@ def ofd2img(file_path,output_dir):
    return output_files,pdfbytes
-def parse_ofd(config_path,file_path,output_dir):
-    config = configparser.ConfigParser()
+class ocrOfdClient:
-    config.read(config_path)
+    def __init__(self, api_url):
-    url = config.get('server', 'ocr_server')
+        self.api_url = api_url
-    client = PredictClient(url)
-    ofd_imgs,pdfbytes = ofd2img(file_path,output_dir)
+    def check_health(self):
-    # logger.info(f'url:{url}\tofd_img:{ofd_imgs}')
+        health_check_url = f'{self.api_url}/health'
-    text = '判断图片是否是发票，如果是发票精确提取图片中的内容，否则返回False'
+        try:
-    ofd_txts = ''
+            response = requests.get(health_check_url)
-    for ofd_img in ofd_imgs:
+            if response.status_code == 200:
-        compress_image(ofd_img)
+                logger.info("Server is healthy and ready to process requests.")
-        res = client.predict(ofd_img,text)
+                return True
-        if 'False' in res or 'false' in res:
+            else:
-            ofd_pdf = ofd2pdf(file_path,output_dir,pdfbytes)
+                logger.error(f'Server health check failed with status code:{response.status_code}')
-            logger.info(f'ofd_pdf:{ofd_pdf}')
+                return False
-            pdf_server = config.get('server', 'pdf_server')
+        except requests.exceptions.RequestException as e:
-            pdf_ocr = ocrPdfClient(pdf_server)
+            logger.error(f'Health check request failed:{e}')
-            ofd_txt = pdf_ocr.ocr_pdf_client(path=ofd_pdf, output_dir=output_dir)
+            return False
-            break
+    def parse_ofd(self,config_path,file_path,output_dir):
+        # 构造请求数据
+        data = {
+            "path": str(file_path),
+            "output_dir": str(output_dir),
+            "config_path": str(config_path),
+        }
+        # 发送 POST 请求
+        response = requests.post(f"{self.api_url}/ofd_ocr", json=data)
+        # 处理响应
+        if response.status_code == 200:
+            result = response.json()
+            logger.info(f"文件解析成功，输出路径：{result['output_path']}")
+            return result['output_path']
        else:
-            res = decode_html_entities(res)
+            logger.error(f"文件解析失败，错误信息：{response.json()}")
-            res = json_to_txt(res)
-            ofd_txts = ofd_txts + res + '\n'
-    if ofd_txts != '':
-        file_name = os.path.basename(file_name).split('.')
-        ofd_txt = os.path.join(output_dir,file_name) + '.txt'
-        logger.info(f'ofd_txt:{ofd_txt}')
-        with open(ofd_txt, 'w', encoding='utf-8') as f:
-            f.write(str(ofd_txts))
-    return ofd_txt
-#
-# if __name__ == '__main__':
-#     file_path = ''
-#     out_path = ''
-#     ofd2pdf()