Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
b2bb218c
Commit
b2bb218c
authored
Nov 13, 2024
by
zhougaofeng
Browse files
Update ofd_parse.py
parent
fb058635
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
40 additions
and
36 deletions
+40
-36
magic_pdf/parse/ofd_parse.py
magic_pdf/parse/ofd_parse.py
+40
-36
No files found.
magic_pdf/parse/ofd_parse.py
View file @
b2bb218c
...
...
@@ -8,6 +8,10 @@ from magic_pdf.dict2md.ocr_vllm_client import PredictClient,compress_image
import
configparser
from
magic_pdf.parse.pdf_client
import
ocrPdfClient
import
html
import
requests
def
decode_html_entities
(
text
):
# 将 HTML 实体转换为相应的字符
...
...
@@ -90,40 +94,40 @@ def ofd2img(file_path,output_dir):
return
output_files
,
pdfbytes
def
parse_ofd
(
config_path
,
file_path
,
output_dir
):
config
=
configparser
.
ConfigParser
()
config
.
read
(
config_path
)
url
=
config
.
get
(
'server'
,
'ocr_server'
)
client
=
PredictClient
(
url
)
ofd_imgs
,
pdfbytes
=
ofd2img
(
file_path
,
output_dir
)
# logger.info(f'url:{url}\tofd_img:{ofd_imgs}')
text
=
'判断图片是否是发票,如果是发票精确提取图片中的内容,否则返回False'
ofd_txts
=
''
for
ofd_img
in
ofd_imgs
:
compress_image
(
ofd_img
)
res
=
client
.
predict
(
ofd_img
,
text
)
if
'False'
in
res
or
'false'
in
res
:
ofd_pdf
=
ofd2pdf
(
file_path
,
output_dir
,
pdfbytes
)
logger
.
info
(
f
'ofd_pdf:
{
ofd_pdf
}
'
)
pdf_server
=
config
.
get
(
'server'
,
'pdf_server'
)
pdf_ocr
=
ocrPdfClient
(
pdf_server
)
ofd_txt
=
pdf_ocr
.
ocr_pdf_client
(
path
=
ofd_pdf
,
output_dir
=
output_dir
)
break
class
ocrOfdClient
:
def
__init__
(
self
,
api_url
):
self
.
api_url
=
api_url
def
check_health
(
self
):
health_check_url
=
f
'
{
self
.
api_url
}
/health'
try
:
response
=
requests
.
get
(
health_check_url
)
if
response
.
status_code
==
200
:
logger
.
info
(
"Server is healthy and ready to process requests."
)
return
True
else
:
logger
.
error
(
f
'Server health check failed with status code:
{
response
.
status_code
}
'
)
return
False
except
requests
.
exceptions
.
RequestException
as
e
:
logger
.
error
(
f
'Health check request failed:
{
e
}
'
)
return
False
def
parse_ofd
(
self
,
config_path
,
file_path
,
output_dir
):
# 构造请求数据
data
=
{
"path"
:
str
(
file_path
),
"output_dir"
:
str
(
output_dir
),
"config_path"
:
str
(
config_path
),
}
# 发送 POST 请求
response
=
requests
.
post
(
f
"
{
self
.
api_url
}
/ofd_ocr"
,
json
=
data
)
# 处理响应
if
response
.
status_code
==
200
:
result
=
response
.
json
()
logger
.
info
(
f
"文件解析成功,输出路径:
{
result
[
'output_path'
]
}
"
)
return
result
[
'output_path'
]
else
:
res
=
decode_html_entities
(
res
)
res
=
json_to_txt
(
res
)
ofd_txts
=
ofd_txts
+
res
+
'
\n
'
if
ofd_txts
!=
''
:
file_name
=
os
.
path
.
basename
(
file_name
).
split
(
'.'
)
ofd_txt
=
os
.
path
.
join
(
output_dir
,
file_name
)
+
'.txt'
logger
.
info
(
f
'ofd_txt:
{
ofd_txt
}
'
)
with
open
(
ofd_txt
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
str
(
ofd_txts
))
return
ofd_txt
#
# if __name__ == '__main__':
# file_path = ''
# out_path = ''
# ofd2pdf()
logger
.
error
(
f
"文件解析失败,错误信息:
{
response
.
json
()
}
"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment