Commit e986ba8a authored by zhougaofeng
Browse files

Update pdf_server.py

parent 7b3cb3b2
......@@ -20,7 +20,7 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
from magic_pdf.parse.pdf_client import ocrPdfClient
from magic_pdf.parse.ofd_parse import *
from magic_pdf.tools.ofd_parser import OFDParser
app = FastAPI()
......@@ -173,27 +173,22 @@ async def ofd_ocr(request: ocrRequest):
# 确保输出目录存在
os.makedirs(request.output_dir, exist_ok=True)
# 处理 OFD 文件
ofd_imgs, pdfbytes = ofd2img(request.path, request.output_dir)
text = '识别图片的内容,如果是发票就执行以下操作识别图中的文字信息,并以json格式返回,如果不是发票返回False'
# 判断 OFD 是否为发票
logger.info(f'正在判断ofd文件类型')
check_res,ofd_imgs,pdfbytes = check_ofd(request.path,client,request.output_dir)
text = '识别图片的内容,如果是发票就识别图中的文字信息,并以json格式返回'
# 初始化变量
ofd_txts = ''
ofd_txt = ''
# 遍历 OFD 图片,逐一进行识别
# 判断 OFD 是否为发票
if check_res:
# 如果是发票,进行 OCR 识别
for ofd_img in ofd_imgs:
compress_image(ofd_img)
res = client.predict(ofd_img, text)
# 如果识别结果是非发票,则尝试解析 PDF
if 'False' in res or 'false' in res:
ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes)
ofd_txt = pdf_ocr.ocr_pdf_client(path=ofd_pdf, output_dir=request.output_dir)
break
else:
# 处理识别结果
res = decode_html_entities(res)
res = json_to_txt(res)
ofd_txts += res + '\n'
......@@ -203,6 +198,10 @@ async def ofd_ocr(request: ocrRequest):
ofd_txt = os.path.join(request.output_dir, f"{file_name}.txt")
with open(ofd_txt, 'w', encoding='utf-8') as f:
f.write(ofd_txts)
else:
# 否则,将 OFD 转换为 PDF 进行 OCR
ofd_pdf = ofd2pdf(request.path, request.output_dir, pdfbytes)
ofd_txt = pdf_ocr.ocr_pdf_client(request.config_path,path=ofd_pdf, output_dir=request.output_dir)
# 返回结果
if ofd_txt:
......@@ -217,6 +216,56 @@ async def ofd_ocr(request: ocrRequest):
raise HTTPException(status_code=500, detail="处理文件时发生错误")
# Keyword-based check of whether an OFD file is an invoice
def check_ofd_by_keywords(filepath):
    """Return True if any page's text contains every invoice keyword.

    The file at ``filepath`` is read as bytes, base64-encoded and passed to
    ``OFDParser``. Each entry of the parse result is indexed by ``'page_info'``
    and every page item is queried for ``'text_list'``; the stringified value
    is searched for the keywords.

    Returns:
        True as soon as one page contains all keywords, otherwise False.

    Raises:
        HTTPException: status 500 when reading, parsing or scanning fails.
    """
    try:
        with open(filepath, "rb") as ofd_file:
            raw_bytes = ofd_file.read()
        ofdb64 = str(base64.b64encode(raw_bytes), "utf-8")
        # NOTE(review): the original author only assumes OFDParser is the
        # class that handles OFD files -- confirm against its definition.
        parsed_docs = OFDParser(ofdb64)()
        invoice_keywords = ['发票代码', '发票号码', '发票', '开票日期']

        # Walk every page of every parsed document looking for the keywords.
        for doc in parsed_docs:
            pages = doc['page_info']
            # Index-based access is kept on purpose: the container type of
            # 'page_info' is not visible from this function.
            for page_idx in range(len(pages)):
                page_text = str(pages[page_idx].get('text_list', ''))
                has_missing_keyword = any(
                    keyword not in page_text for keyword in invoice_keywords
                )
                if not has_missing_keyword:
                    return True
        return False
    except Exception as e:
        logger.error(f"OFD 文件判断异常: {filepath},报错:{e}")
        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
# Model-based (e.g. Qwen) check of whether an OFD file is an invoice
def check_ofd_by_qwen(filepath, client, text, output_dir):
    """Ask the prediction client whether any rendered OFD page is an invoice.

    The OFD file is rendered with ``ofd2img``; each resulting image is passed
    through ``compress_image`` and then sent to ``client.predict`` together
    with the prompt ``text``. Scanning stops at the first answer that contains
    the substring ``'True'``.

    Returns:
        A tuple ``(is_invoice, ofd_imgs, pdfbytes)`` where the last two items
        are exactly what ``ofd2img`` returned.

    Raises:
        HTTPException: status 500 when rendering or prediction fails.
    """
    try:
        page_images, pdf_data = ofd2img(filepath, output_dir)
        is_invoice = False
        for page_image in page_images:
            compress_image(page_image)
            answer = client.predict(page_image, text)
            # Assumes the model's reply contains the string True or False.
            if 'True' in answer:
                is_invoice = True
                break
        return is_invoice, page_images, pdf_data
    except Exception as e:
        logger.error(f"基于 Qwen 判断 OFD 文件时异常: {filepath},报错:{e}")
        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
# Combined check of whether an OFD file is an invoice
def check_ofd(filepath, client, output_dir):
    """Decide whether an OFD file is an invoice and return its rendered pages.

    The cheap keyword check runs first. Only when it passes is the file sent
    to the model-based check, which also renders the pages. When the keyword
    check fails the file is still rendered with ``ofd2img`` so that the
    function always returns a 3-tuple; previously this path yielded no usable
    value and the caller's tuple unpacking failed for every non-invoice file.

    Args:
        filepath: Path of the OFD file to inspect.
        client: Prediction client forwarded to ``check_ofd_by_qwen``.
        output_dir: Directory forwarded to the rendering helpers.

    Returns:
        A tuple ``(is_invoice, ofd_imgs, pdfbytes)``.

    Raises:
        HTTPException: propagated from the keyword or model-based checks.
    """
    # Start with the keyword check.
    if check_ofd_by_keywords(filepath):
        # All keywords are present: confirm with the model.
        text = '请判断图片是否为发票,如果是发票,请返回"True",否则返回"False"'
        return check_ofd_by_qwen(filepath, client, text, output_dir)

    # Not an invoice by keywords: the caller still needs the rendered images
    # and PDF bytes to fall back to the OFD -> PDF OCR path.
    ofd_imgs, pdfbytes = ofd2img(filepath, output_dir)
    return False, ofd_imgs, pdfbytes
def main():
args = parse_args()
ocr_pdf_serve(args)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment