Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
e986ba8a
Commit
e986ba8a
authored
Nov 13, 2024
by
zhougaofeng
Browse files
Update pdf_server.py
parent
7b3cb3b2
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
72 additions
and
23 deletions
+72
-23
magic_pdf/tools/pdf_server.py
magic_pdf/tools/pdf_server.py
+72
-23
No files found.
magic_pdf/tools/pdf_server.py
View file @
e986ba8a
...
...
@@ -20,7 +20,7 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
# from magic_pdf.dict2md.ocr_client import PredictClient,compress_image
from
magic_pdf.parse.pdf_client
import
ocrPdfClient
from
magic_pdf.parse.ofd_parse
import
*
from
magic_pdf.tools.ofd_parser
import
OFDParser
app
=
FastAPI
()
...
...
@@ -173,27 +173,22 @@ async def ofd_ocr(request: ocrRequest):
# 确保输出目录存在
os
.
makedirs
(
request
.
output_dir
,
exist_ok
=
True
)
# 处理 OFD 文件
ofd_imgs
,
pdfbytes
=
ofd2img
(
request
.
path
,
request
.
output_dir
)
text
=
'识别图片的内容,如果是发票就执行以下操作识别图中的文字信息,并以json格式返回,如果不是发票返回False'
# 判断 OFD 是否为发票
logger
.
info
(
f
'正在判断ofd文件类型'
)
check_res
,
ofd_imgs
,
pdfbytes
=
check_ofd
(
request
.
path
,
client
,
request
.
output_dir
)
text
=
'识别图片的内容,如果是发票就识别图中的文字信息,并以json格式返回'
# 初始化变量
ofd_txts
=
''
ofd_txt
=
''
# 遍历 OFD 图片,逐一进行识别
# 判断 OFD 是否为发票
if
check_res
:
# 如果是发票,进行 OCR 识别
for
ofd_img
in
ofd_imgs
:
compress_image
(
ofd_img
)
res
=
client
.
predict
(
ofd_img
,
text
)
# 如果识别结果是非发票,则尝试解析 PDF
if
'False'
in
res
or
'false'
in
res
:
ofd_pdf
=
ofd2pdf
(
request
.
path
,
request
.
output_dir
,
pdfbytes
)
ofd_txt
=
pdf_ocr
.
ocr_pdf_client
(
path
=
ofd_pdf
,
output_dir
=
request
.
output_dir
)
break
else
:
# 处理识别结果
res
=
decode_html_entities
(
res
)
res
=
json_to_txt
(
res
)
ofd_txts
+=
res
+
'
\n
'
...
...
@@ -203,6 +198,10 @@ async def ofd_ocr(request: ocrRequest):
ofd_txt
=
os
.
path
.
join
(
request
.
output_dir
,
f
"
{
file_name
}
.txt"
)
with
open
(
ofd_txt
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
ofd_txts
)
else
:
# 否则,将 OFD 转换为 PDF 进行 OCR
ofd_pdf
=
ofd2pdf
(
request
.
path
,
request
.
output_dir
,
pdfbytes
)
ofd_txt
=
pdf_ocr
.
ocr_pdf_client
(
request
.
config_path
,
path
=
ofd_pdf
,
output_dir
=
request
.
output_dir
)
# 返回结果
if
ofd_txt
:
...
...
@@ -217,6 +216,56 @@ async def ofd_ocr(request: ocrRequest):
raise
HTTPException
(
status_code
=
500
,
detail
=
"处理文件时发生错误"
)
def check_ofd_by_keywords(filepath):
    """Keyword-based check of whether an OFD file is an invoice.

    The file is read, base64-encoded and handed to ``OFDParser``. For every
    parsed document, each entry of its ``'page_info'`` is inspected: the
    entry's ``'text_list'`` is stringified and searched for the invoice
    keywords.

    Args:
        filepath: Path of the OFD file to inspect.

    Returns:
        True as soon as one page contains *all* keywords, otherwise False.

    Raises:
        HTTPException: status 500 when anything goes wrong while reading or
            parsing the file (the underlying error is logged first).
    """
    required_keywords = ('发票代码', '发票号码', '发票', '开票日期')
    try:
        with open(filepath, "rb") as ofd_file:
            raw_bytes = ofd_file.read()
        encoded_ofd = base64.b64encode(raw_bytes).decode("utf-8")

        # OFDParser instances are callable; calling one yields the parsed documents.
        parsed_docs = OFDParser(encoded_ofd)()

        for doc in parsed_docs:
            pages = doc['page_info']
            # Index-based access is kept on purpose: pages is addressed by
            # position 0..len-1, exactly as the parser output is consumed here.
            for page_idx in range(len(pages)):
                page_text = str(pages[page_idx].get('text_list', ''))
                if not any(kw not in page_text for kw in required_keywords):
                    return True
        return False
    except Exception as e:
        logger.error(f"OFD 文件判断异常: {filepath},报错: {e}")
        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
def check_ofd_by_qwen(filepath, client, text, output_dir):
    """Model-based (e.g. Qwen) check of whether an OFD file is an invoice.

    Args:
        filepath: Path of the OFD file.
        client: Object exposing ``predict(image, prompt)``; its result is
            searched for the substring ``'True'``.
        text: Prompt passed to ``client.predict`` for every image.
        output_dir: Directory forwarded to ``ofd2img``.

    Returns:
        A 3-tuple ``(is_invoice, ofd_imgs, pdfbytes)`` where the last two
        items are exactly what ``ofd2img(filepath, output_dir)`` returned and
        ``is_invoice`` is True when any image's prediction contains ``'True'``.

    Raises:
        HTTPException: status 500 when conversion or prediction fails (the
            underlying error is logged first).
    """
    try:
        page_images, pdf_bytes = ofd2img(filepath, output_dir)

        is_invoice = False
        for page_image in page_images:
            # compress_image's return value is not used; the same image
            # reference is then sent to the model.
            compress_image(page_image)
            answer = client.predict(page_image, text)
            # The reply is assumed to contain the literal "True" or "False".
            if 'True' in answer:
                is_invoice = True
                break  # one positive image is enough; skip the rest

        return is_invoice, page_images, pdf_bytes
    except Exception as e:
        logger.error(f"基于 Qwen 判断 OFD 文件时异常: {filepath},报错: {e}")
        raise HTTPException(status_code=500, detail="判断ofd文件类型时发生错误")
def check_ofd(filepath, client, output_dir):
    """Decide whether an OFD file is an invoice (keywords first, then model).

    The cheap keyword check runs first. Only when it succeeds is the model
    asked to confirm via ``check_ofd_by_qwen``.

    Previously nothing was assigned when the keyword check failed, so the
    function could not hand back the 3-tuple its caller unpacks. That case
    now returns ``False`` together with the output of ``ofd2img``, because the
    caller still needs ``pdfbytes`` for its non-invoice (OFD -> PDF) path.

    Args:
        filepath: Path of the OFD file.
        client: Prediction client forwarded to ``check_ofd_by_qwen``.
        output_dir: Directory forwarded to ``ofd2img`` / ``check_ofd_by_qwen``.

    Returns:
        A 3-tuple ``(is_invoice, ofd_imgs, pdfbytes)``.

    Raises:
        HTTPException: propagated from ``check_ofd_by_keywords`` or
            ``check_ofd_by_qwen``. Errors raised by ``ofd2img`` in the
            keyword-miss path propagate unchanged.
    """
    if not check_ofd_by_keywords(filepath):
        # Not an invoice by keywords: skip the model call, but still produce
        # the images / PDF bytes so the caller can unpack and fall back.
        ofd_imgs, pdfbytes = ofd2img(filepath, output_dir)
        return False, ofd_imgs, pdfbytes

    # All keywords present: let the model confirm.
    text = '请判断图片是否为发票,如果是发票,请返回"True",否则返回"False"'
    res, ofd_imgs, pdfbytes = check_ofd_by_qwen(filepath, client, text, output_dir)
    return res, ofd_imgs, pdfbytes
def main():
    """Entry point: pass the result of ``parse_args()`` to ``ocr_pdf_serve``."""
    ocr_pdf_serve(parse_args())
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment