Commit 7c7be857 authored by zhougaofeng
Browse files

Update pdf_server.py

parent c453d81f
...@@ -18,9 +18,9 @@ import time ...@@ -18,9 +18,9 @@ import time
import configparser import configparser
from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# from magic_pdf.dict2md.ocr_client import PredictClient,compress_image # from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
from magic_pdf.parse.pdf_client import ocrPdfClient
from magic_pdf.parse.ofd_parse import * from magic_pdf.parse.ofd_parse import *
from magic_pdf.tools.ofd_parser import OFDParser from magic_pdf.tools.ofd_parser import OFDParser
from magic_pdf.parse.pdf_client import ocrPdfClient
app = FastAPI() app = FastAPI()
...@@ -168,16 +168,16 @@ async def ofd_ocr(request: ocrRequest): ...@@ -168,16 +168,16 @@ async def ofd_ocr(request: ocrRequest):
# 创建客户端 # 创建客户端
client = PredictClient(url) client = PredictClient(url)
pdf_ocr = ocrPdfClient(pdf_server) # pdf_ocr = ocrPdfClient(pdf_server)
# 确保输出目录存在 # 确保输出目录存在
os.makedirs(request.output_dir, exist_ok=True) os.makedirs(request.output_dir, exist_ok=True)
# 判断 OFD 是否为发票 # 判断 OFD 是否为发票
logger.info(f'正在判断ofd文件类型') # logger.info(f'正在判断ofd文件类型')
check_res,ofd_imgs,pdfbytes = check_ofd(request.path,client,request.output_dir) check_res,ofd_imgs,pdfbytes = check_ofd(request.path,client,request.output_dir)
text = '识别图片的内容,如果是发票就识别图中的文字信息,并以json格式返回' text = '提取图中的文字信息,并以json格式返回'
# 初始化变量 # 初始化变量
ofd_txts = '' ofd_txts = ''
...@@ -190,6 +190,7 @@ async def ofd_ocr(request: ocrRequest): ...@@ -190,6 +190,7 @@ async def ofd_ocr(request: ocrRequest):
compress_image(ofd_img) compress_image(ofd_img)
res = client.predict(ofd_img, text) res = client.predict(ofd_img, text)
res = json_to_txt(res) res = json_to_txt(res)
res = decode_html_entities(res)
ofd_txts += res + '\n' ofd_txts += res + '\n'
# 如果有识别文本,将其写入文件 # 如果有识别文本,将其写入文件
...@@ -201,7 +202,17 @@ async def ofd_ocr(request: ocrRequest): ...@@ -201,7 +202,17 @@ async def ofd_ocr(request: ocrRequest):
else: else:
# 否则,将 OFD 转换为 PDF 进行 OCR # 否则,将 OFD 转换为 PDF 进行 OCR
ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes) ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes)
ofd_txt = pdf_ocr.ocr_pdf_client(request.config_path,path=ofd_pdf, output_dir=request.output_dir) request.path = ofd_pdf
# logger.info(f'request:{request}')
response = await pdf_ocr(request)
ofd_txt = response.json()['output_path']
ofd_imgs.append(ofd_pdf)
for ofd_path in ofd_imgs:
if os.path.isfile(ofd_path):
os.remove(ofd_path)
# ofd_txt = pdf_ocr.ocr_pdf_client(request.config_path,path=ofd_pdf, output_dir=request.output_dir)
# 返回结果 # 返回结果
if ofd_txt: if ofd_txt:
...@@ -247,9 +258,9 @@ def check_ofd_by_qwen(filepath, client, text,output_dir): ...@@ -247,9 +258,9 @@ def check_ofd_by_qwen(filepath, client, text,output_dir):
for ofd_img in ofd_imgs: for ofd_img in ofd_imgs:
compress_image(ofd_img) compress_image(ofd_img)
res = client.predict(ofd_img, text) res = client.predict(ofd_img, text)
if 'True' in res: # 假设返回的结果包含 True 或 False 字符串 if 'False' in res: # 假设返回的结果包含 True 或 False 字符串
return True,ofd_imgs, pdfbytes
return False,ofd_imgs, pdfbytes return False,ofd_imgs, pdfbytes
return True,ofd_imgs, pdfbytes
except Exception as e: except Exception as e:
logger.error(f"基于 Qwen 判断 OFD 文件时异常: {filepath},报错:{e}") logger.error(f"基于 Qwen 判断 OFD 文件时异常: {filepath},报错:{e}")
raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误") raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
...@@ -259,10 +270,16 @@ def check_ofd_by_qwen(filepath, client, text,output_dir): ...@@ -259,10 +270,16 @@ def check_ofd_by_qwen(filepath, client, text,output_dir):
# 综合判断 OFD 是否为发票 # 综合判断 OFD 是否为发票
def check_ofd(filepath,client,output_dir): def check_ofd(filepath,client,output_dir):
# 首先通过关键词检查 # 首先通过关键词检查
if check_ofd_by_keywords(filepath): res_key = check_ofd_by_keywords(filepath)
# 如果包含所有关键词,进一步使用 Qwen 判断 # 如果包含所有关键词,进一步使用 Qwen 判断
text = '请判断图片是否为发票,如果是发票,请返回"True",否则返回"False"' text = '请判断图片是否为发票,如果是发票,请返回"True",否则返回"False"'
res,ofd_imgs, pdfbytes = check_ofd_by_qwen(filepath, client, text,output_dir) res_ocr, ofd_imgs, pdfbytes = check_ofd_by_qwen(filepath, client, text, output_dir)
res = False
if res_ocr and res_key:
res = True
return res,ofd_imgs, pdfbytes return res,ofd_imgs, pdfbytes
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment