Update pdf_server.py

7c7be857 · zhougaofeng · c453d81f · 7c7be857
Commit 7c7be857 authored Nov 15, 2024 by zhougaofeng
Hide whitespace changes
Inline Side-by-side

Showing with 28 additions and 11 deletions

magic_pdf/tools/pdf_server.py magic_pdf/tools/pdf_server.py +28 -11

No files found.
--- a/magic_pdf/tools/pdf_server.py
+++ b/magic_pdf/tools/pdf_server.py
@@ -18,9 +18,9 @@ import time
 import configparser
 from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
 # from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
-from magic_pdf.parse.pdf_client import ocrPdfClient
 from magic_pdf.parse.ofd_parse import *
 from magic_pdf.tools.ofd_parser import OFDParser
+from magic_pdf.parse.pdf_client import ocrPdfClient


 app = FastAPI()
@@ -168,16 +168,16 @@ async def ofd_ocr(request: ocrRequest):

        # 创建客户端
        client = PredictClient(url)
-        pdf_ocr = ocrPdfClient(pdf_server)
+        # pdf_ocr = ocrPdfClient(pdf_server)

        # 确保输出目录存在
        os.makedirs(request.output_dir, exist_ok=True)

        # 判断 OFD 是否为发票
-        logger.info(f'正在判断ofd文件类型')
+        # logger.info(f'正在判断ofd文件类型')
        check_res,ofd_imgs,pdfbytes = check_ofd(request.path,client,request.output_dir)

-        text = '识别图片的内容，如果是发票就识别图中的文字信息，并以json格式返回'
+        text = '提取图中的文字信息，并以json格式返回'

        # 初始化变量
        ofd_txts = ''
@@ -190,6 +190,7 @@ async def ofd_ocr(request: ocrRequest):
                compress_image(ofd_img)
                res = client.predict(ofd_img, text)
                res = json_to_txt(res)
+                res = decode_html_entities(res)
                ofd_txts += res + '\n'

            # 如果有识别文本，将其写入文件
@@ -201,7 +202,17 @@ async def ofd_ocr(request: ocrRequest):
        else:
            # 否则，将 OFD 转换为 PDF 进行 OCR
            ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes)
-            ofd_txt = pdf_ocr.ocr_pdf_client(request.config_path,path=ofd_pdf, output_dir=request.output_dir)
+            request.path = ofd_pdf
+
+            # logger.info(f'request:{request}')
+            response = await pdf_ocr(request)
+            ofd_txt = response.json()['output_path']
+            ofd_imgs.append(ofd_pdf)
+            for ofd_path in ofd_imgs:
+                if os.path.isfile(ofd_path):
+                    os.remove(ofd_path)
+
+            # ofd_txt = pdf_ocr.ocr_pdf_client(request.config_path,path=ofd_pdf, output_dir=request.output_dir)

        # 返回结果
        if ofd_txt:
@@ -247,9 +258,9 @@ def check_ofd_by_qwen(filepath, client, text,output_dir):
        for ofd_img in ofd_imgs:
            compress_image(ofd_img)
            res = client.predict(ofd_img, text)
-            if 'True' in res:  # 假设返回的结果包含 True 或 False 字符串
-                return True,ofd_imgs, pdfbytes
-        return False,ofd_imgs, pdfbytes
+            if 'False' in res:  # 假设返回的结果包含 True 或 False 字符串
+                return False,ofd_imgs, pdfbytes
+        return True,ofd_imgs, pdfbytes
    except Exception as e:
        logger.error(f"基于 Qwen 判断 OFD 文件时异常: {filepath}，报错：{e}")
        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
@@ -259,10 +270,16 @@ def check_ofd_by_qwen(filepath, client, text,output_dir):
 # 综合判断 OFD 是否为发票
 def check_ofd(filepath,client,output_dir):
    # 首先通过关键词检查
-    if check_ofd_by_keywords(filepath):
+    res_key = check_ofd_by_keywords(filepath)
        # 如果包含所有关键词，进一步使用 Qwen 判断
-        text = '请判断图片是否为发票,如果是发票，请返回"True"，否则返回"False"'
-        res,ofd_imgs, pdfbytes =  check_ofd_by_qwen(filepath, client, text,output_dir)
+
+    text = '请判断图片是否为发票,如果是发票，请返回"True"，否则返回"False"'
+    res_ocr, ofd_imgs, pdfbytes = check_ofd_by_qwen(filepath, client, text, output_dir)
+
+    res = False
+    if res_ocr and res_key:
+        res = True
+
    return res,ofd_imgs, pdfbytes