Update pdf_server.py

e986ba8a · zhougaofeng · 7b3cb3b2 · e986ba8a
Commit e986ba8a authored Nov 13, 2024 by zhougaofeng
Show whitespace changes
Inline Side-by-side

Showing with 72 additions and 23 deletions

magic_pdf/tools/pdf_server.py magic_pdf/tools/pdf_server.py +72 -23

No files found.
--- a/magic_pdf/tools/pdf_server.py
+++ b/magic_pdf/tools/pdf_server.py
@@ -20,7 +20,7 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
 # from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
 from magic_pdf.parse.pdf_client import ocrPdfClient
 from magic_pdf.parse.ofd_parse import *
-
+from magic_pdf.tools.ofd_parser import OFDParser


 app = FastAPI()
@@ -173,27 +173,22 @@ async def ofd_ocr(request: ocrRequest):
        # 确保输出目录存在
        os.makedirs(request.output_dir, exist_ok=True)

-        # 处理 OFD 文件
-        ofd_imgs, pdfbytes = ofd2img(request.path, request.output_dir)
-        text = '识别图片的内容，如果是发票就执行以下操作识别图中的文字信息，并以json格式返回，如果不是发票返回False'
+        # 判断 OFD 是否为发票
+        logger.info(f'正在判断ofd文件类型')
+        check_res,ofd_imgs,pdfbytes = check_ofd(request.path,client,request.output_dir)
+
+        text = '识别图片的内容，如果是发票就识别图中的文字信息，并以json格式返回'

        # 初始化变量
        ofd_txts = ''
        ofd_txt = ''

-        # 遍历 OFD 图片，逐一进行识别
+        # 判断 OFD 是否为发票
+        if check_res:
+            # 如果是发票，进行 OCR 识别
            for ofd_img in ofd_imgs:
                compress_image(ofd_img)
                res = client.predict(ofd_img, text)
-
-            # 如果识别结果是非发票，则尝试解析 PDF
-            if 'False' in res or 'false' in res:
-                ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes)
-                ofd_txt = pdf_ocr.ocr_pdf_client(path=ofd_pdf, output_dir=request.output_dir)
-                break
-            else:
-                # 处理识别结果
-                res = decode_html_entities(res)
                res = json_to_txt(res)
                ofd_txts += res + '\n'

@@ -203,6 +198,10 @@ async def ofd_ocr(request: ocrRequest):
                ofd_txt = os.path.join(request.output_dir, f"{file_name}.txt")
                with open(ofd_txt, 'w', encoding='utf-8') as f:
                    f.write(ofd_txts)
+        else:
+            # 否则，将 OFD 转换为 PDF 进行 OCR
+            ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes)
+            ofd_txt = pdf_ocr.ocr_pdf_client(request.config_path,path=ofd_pdf, output_dir=request.output_dir)

        # 返回结果
        if ofd_txt:
@@ -217,6 +216,56 @@ async def ofd_ocr(request: ocrRequest):
        raise HTTPException(status_code=500, detail="处理文件时发生错误")


+# Keyword-based check of whether an OFD file is an invoice.
+def check_ofd_by_keywords(filepath):
+    """Report whether some ``page_info`` entry of the OFD contains every invoice keyword.
+
+    The file is read, base64-encoded and handed to ``OFDParser``. For each
+    parsed result, every entry of its ``page_info`` is inspected: the entry's
+    ``text_list`` is stringified and must contain all invoice keywords.
+
+    Args:
+        filepath: Path of the OFD file to inspect.
+
+    Returns:
+        True as soon as one entry contains all keywords, otherwise False.
+
+    Raises:
+        HTTPException: Status 500 when reading, parsing or inspecting the
+            file fails for any reason; the underlying error is logged first.
+    """
+    # NOTE: '发票' is a substring of '发票代码', so it never changes the outcome.
+    required_keywords = ['发票代码', '发票号码', '发票', '开票日期']
+    try:
+        with open(filepath, "rb") as ofd_file:
+            raw_bytes = ofd_file.read()
+        ofd_b64 = base64.b64encode(raw_bytes).decode("utf-8")
+        parsed_docs = OFDParser(ofd_b64)()
+
+        for doc in parsed_docs:
+            pages = doc['page_info']
+            # Index-based access is kept on purpose: the container type of
+            # page_info is not visible here (list vs. mapping keyed 0..n-1).
+            for page_idx in range(len(pages)):
+                page_text = str(pages[page_idx].get('text_list', ''))
+                if all(keyword in page_text for keyword in required_keywords):
+                    return True
+        return False
+    except Exception as e:
+        logger.error(f"OFD 文件判断异常: {filepath}，报错：{e}")
+        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
+
+
+# Model-based (e.g. Qwen) check of whether an OFD file is an invoice.
+def check_ofd_by_qwen(filepath, client, text, output_dir):
+    """Ask the prediction client whether any image produced from the OFD is an invoice.
+
+    The OFD is converted with ``ofd2img``; each resulting image is passed to
+    ``compress_image`` and then to ``client.predict`` together with ``text``.
+    Processing stops at the first answer containing ``'True'``.
+
+    Args:
+        filepath: Path of the OFD file.
+        client: Object exposing ``predict(image, prompt)``.
+        text: Prompt sent along with every image.
+        output_dir: Directory forwarded to ``ofd2img``.
+
+    Returns:
+        A tuple ``(is_invoice, ofd_imgs, pdfbytes)``; the last two items are
+        exactly what ``ofd2img`` returned.
+
+    Raises:
+        HTTPException: Status 500 when conversion, compression or prediction
+            fails for any reason; the underlying error is logged first.
+    """
+    try:
+        ofd_imgs, pdfbytes = ofd2img(filepath, output_dir)
+        is_invoice = False
+        for page_img in ofd_imgs:
+            compress_image(page_img)
+            answer = client.predict(page_img, text)
+            # NOTE(review): the match is case-sensitive, so a lowercase 'true'
+            # reply counts as "not an invoice" - confirm the model's casing.
+            if 'True' in answer:
+                is_invoice = True
+                break
+        return is_invoice, ofd_imgs, pdfbytes
+    except Exception as e:
+        logger.error(f"基于 Qwen 判断 OFD 文件时异常: {filepath}，报错：{e}")
+        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
+
+
+
+# Combined check of whether an OFD file is an invoice.
+def check_ofd(filepath, client, output_dir):
+    """Decide whether an OFD file is an invoice.
+
+    A keyword pre-filter (``check_ofd_by_keywords``) runs first; only files
+    that pass it are sent to the model-based check (``check_ofd_by_qwen``).
+
+    Args:
+        filepath: Path of the OFD file.
+        client: Prediction client forwarded to ``check_ofd_by_qwen``.
+        output_dir: Directory forwarded to ``ofd2img`` / ``check_ofd_by_qwen``.
+
+    Returns:
+        A tuple ``(is_invoice, ofd_imgs, pdfbytes)``. The images and PDF bytes
+        are returned in both outcomes.
+
+    Raises:
+        HTTPException: Propagated from the keyword or model-based checks.
+    """
+    if not check_ofd_by_keywords(filepath):
+        # The caller unpacks three values and needs pdfbytes for its
+        # OFD-to-PDF fallback, so the file is still converted here. This path
+        # used to fall through to the return with unassigned locals and
+        # raised UnboundLocalError for every non-invoice file.
+        ofd_imgs, pdfbytes = ofd2img(filepath, output_dir)
+        return False, ofd_imgs, pdfbytes
+
+    # Keywords matched: let the model confirm that it really is an invoice.
+    text = '请判断图片是否为发票,如果是发票，请返回"True"，否则返回"False"'
+    return check_ofd_by_qwen(filepath, client, text, output_dir)
+
+
 def main():
    args = parse_args()
    ocr_pdf_serve(args)