Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhougaofeng
magic_pdf
Commits
9ab44892
"docs/source/en/vscode:/vscode.git/clone" did not exist on "45f6d52b109604d6754ffefa9e289acd1df92994"
Commit
9ab44892
authored
Oct 24, 2024
by
zhougaofeng
Browse files
Update ocr_mkcontent.py
parent
304bd577
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
18 additions
and
9 deletions
+18
-9
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+18
-9
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
9ab44892
...
@@ -11,7 +11,7 @@ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
...
@@ -11,7 +11,7 @@ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.markdown_utils
import
ocr_escape_special_markdown_char
from
magic_pdf.libs.ocr_content_type
import
BlockType
,
ContentType
from
magic_pdf.libs.ocr_content_type
import
BlockType
,
ContentType
# import pypandoc
# import pypandoc
from
magic_pdf.dict2md.ocr_client
import
PredictClient
,
compress_image
from
magic_pdf.dict2md.ocr_
vllm_
client
import
PredictClient
,
compress_image
...
@@ -130,6 +130,12 @@ def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
...
@@ -130,6 +130,12 @@ def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
url
=
config
.
get
(
'server'
,
'ocr_server'
)
url
=
config
.
get
(
'server'
,
'ocr_server'
)
# logger.info(f'ocr_server:{url}')
# logger.info(f'ocr_server:{url}')
client
=
PredictClient
(
url
)
client
=
PredictClient
(
url
)
status
=
PredictClient
.
check_health
()
if
not
status
:
pdf_ocr
=
None
logger
.
warning
(
f
'Health check failed. The server at "
{
url
}
" is not responding as expected.'
)
logger
.
info
(
f
'Qwen ocr解析服务无法正常运行,暂不使用qwen解析表格服务'
)
return
None
for
para_block
in
paras_of_layout
:
for
para_block
in
paras_of_layout
:
para_text
=
''
para_text
=
''
para_type
=
para_block
[
'type'
]
para_type
=
para_block
[
'type'
]
...
@@ -178,14 +184,17 @@ def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
...
@@ -178,14 +184,17 @@ def ocr_mk_markdown_with_para_core_v2(config_path,paras_of_layout,
else
:
else
:
# 处理图片
# 处理图片
# para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
# para_text += f"----------------这是ocr表格内容({join_path(img_buket_path, span['image_path'])})------------------- \n"
text
=
'解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
if
status
:
start
=
time
.
time
()
text
=
'解析图片内容,直接返回一段带有逻辑性的中文书面语描述,要求表达精准,不脱离图片中的实际内容,不要带换行,文中所有的名词不要用指代词'
image_path
=
join_path
(
img_buket_path
,
span
[
'image_path'
])
start
=
time
.
time
()
compress_image
(
image_path
)
image_path
=
join_path
(
img_buket_path
,
span
[
'image_path'
])
generated_text
=
client
.
predict
(
image_path
,
text
)
compress_image
(
image_path
)
end
=
time
.
time
()
generated_text
=
client
.
predict
(
image_path
,
text
)
logger
.
info
(
f
'qwen解析
{
image_path
}
表格的内容为:
{
generated_text
}
,耗时为:
{
end
-
start
}
'
)
end
=
time
.
time
()
para_text
+=
generated_text
logger
.
info
(
f
'qwen解析
{
image_path
}
表格的内容为:
{
generated_text
}
,耗时为:
{
end
-
start
}
'
)
para_text
+=
generated_text
else
:
para_text
+=
f
"----------------图片路径为(
{
join_path
(
img_buket_path
,
span
[
'image_path'
])
}
),请检查qwen ocr服务,重新运行文件解析-------------------
\n
"
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment