Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
c0a9d1c7
Commit
c0a9d1c7
authored
Oct 25, 2024
by
zhougaofeng
Browse files
Update ocr_mkcontent.py
parent
d4e904ba
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
18 deletions
+20
-18
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+20
-18
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
c0a9d1c7
...
...
@@ -17,9 +17,8 @@ from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# 普通 非vllm
from
magic_pdf.dict2md.ocr_client
import
PredictClient
,
compress_image
client
=
None
status
=
None
def
__is_hyphen_at_line_end
(
line
):
"""
...
...
@@ -127,21 +126,12 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
return
page_markdown
def
ocr_mk_markdown_with_para_core_v2
(
config_path
,
paras_of_layout
,
def
ocr_mk_markdown_with_para_core_v2
(
paras_of_layout
,
mode
,
img_buket_path
=
''
):
page_markdown
=
[]
config
=
configparser
.
ConfigParser
()
config
.
read
(
config_path
)
url
=
config
.
get
(
'server'
,
'ocr_server'
)
# logger.info(f'ocr_server:{url}')
client
=
PredictClient
(
url
)
status
=
client
.
check_health
()
if
not
status
:
pdf_ocr
=
None
logger
.
warning
(
f
'Health check failed. The server at "
{
url
}
" is not responding as expected.'
)
logger
.
info
(
f
'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务'
)
return
None
for
para_block
in
paras_of_layout
:
para_text
=
''
para_type
=
para_block
[
'type'
]
...
...
@@ -434,12 +424,24 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
return
content_list
def
union_make
(
config_path
:
str
,
def
union_make
(
config_path
:
str
,
pdf_info_dict
:
list
,
make_mode
:
str
,
drop_mode
:
str
,
img_buket_path
:
str
=
''
):
output_content
=
[]
global
client
global
status
config
=
configparser
.
ConfigParser
()
config
.
read
(
config_path
)
url
=
config
.
get
(
'server'
,
'ocr_server'
)
# logger.info(f'ocr_server:{url}')
client
=
PredictClient
(
url
)
status
=
client
.
check_health
()
if
not
status
:
logger
.
warning
(
f
'Health check failed. The server at "
{
url
}
" is not responding as expected.'
)
logger
.
info
(
f
'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务'
)
for
page_info
in
pdf_info_dict
:
if
page_info
.
get
(
'need_drop'
,
False
):
drop_reason
=
page_info
.
get
(
'drop_reason'
)
...
...
@@ -462,11 +464,11 @@ def union_make(config_path: str,
continue
if
make_mode
==
MakeMode
.
MM_MD
:
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
config_path
,
paras_of_layout
,
'mm'
,
img_buket_path
)
paras_of_layout
,
'mm'
,
img_buket_path
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
NLP_MD
:
page_markdown
=
ocr_mk_markdown_with_para_core_v2
(
config_path
,
paras_of_layout
,
'nlp'
)
paras_of_layout
,
'nlp'
)
output_content
.
extend
(
page_markdown
)
elif
make_mode
==
MakeMode
.
STANDARD_FORMAT
:
for
para_block
in
paras_of_layout
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment