Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
83753cbd
"git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "4a24ed729ffc7a6d7df668bdb524536ffdef8cd9"
Commit
83753cbd
authored
Mar 16, 2024
by
xuchao
Browse files
元素类型引用统一定义
parent
d5ea44f9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
23 additions
and
16 deletions
+23
-16
demo/ocr_demo.py
demo/ocr_demo.py
+4
-4
magic_pdf/dict2md/mkcontent.py
magic_pdf/dict2md/mkcontent.py
+5
-1
magic_pdf/para/para_split.py
magic_pdf/para/para_split.py
+14
-11
No files found.
demo/ocr_demo.py
View file @
83753cbd
...
@@ -30,13 +30,13 @@ def read_json_file(file_path):
...
@@ -30,13 +30,13 @@ def read_json_file(file_path):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
ocr_pdf_path
=
r
"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
ocr_json_file_path
=
r
"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
#
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
#
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
ocr_pdf_path
=
r
"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
#
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
ocr_json_file_path
=
r
"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
try
:
try
:
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
ocr_pdf_model_info
=
read_json_file
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
pth
=
Path
(
ocr_json_file_path
)
...
...
magic_pdf/dict2md/mkcontent.py
View file @
83753cbd
...
@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
...
@@ -326,7 +326,11 @@ def mk_mm_markdown(content_list):
if
content_type
==
"text"
:
if
content_type
==
"text"
:
content_md
.
append
(
c
.
get
(
"text"
))
content_md
.
append
(
c
.
get
(
"text"
))
elif
content_type
==
"equation"
:
elif
content_type
==
"equation"
:
content_md
.
append
(
f
"$$
\n
{
c
.
get
(
'latex'
)
}
\n
$$"
)
content
=
c
.
get
(
"latex"
)
if
content
.
startswith
(
"$$"
)
and
content
.
endswith
(
"$$"
):
content_md
.
append
(
content
)
else
:
content_md
.
append
(
f
"
\n
$$
\n
{
c
.
get
(
'latex'
)
}
\n
$$
\n
"
)
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
elif
content_type
in
UNI_FORMAT_TEXT_TYPE
:
content_md
.
append
(
f
"
{
'#'
*
int
(
content_type
[
1
])
}
{
c
.
get
(
'text'
)
}
"
)
content_md
.
append
(
f
"
{
'#'
*
int
(
content_type
[
1
])
}
{
c
.
get
(
'text'
)
}
"
)
elif
content_type
==
"image"
:
elif
content_type
==
"image"
:
...
...
magic_pdf/para/para_split.py
View file @
83753cbd
...
@@ -3,11 +3,12 @@ import numpy as np
...
@@ -3,11 +3,12 @@ import numpy as np
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.boxbase
import
_is_in
from
magic_pdf.libs.ocr_content_type
import
ContentType
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
LINE_STOP_FLAG
=
[
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
":"
,
":"
,
")"
,
")"
,
";"
]
INLINE_EQUATION
=
'i
nline
_e
quation
'
INLINE_EQUATION
=
ContentType
.
I
nline
E
quation
INTER_EQUATION
=
"displayed_e
quation
"
INTER
LINE
_EQUATION
=
ContentType
.
InterlineE
quation
TEXT
=
"text"
TEXT
=
"text"
def
__add_line_period
(
blocks
,
layout_bboxes
):
def
__add_line_period
(
blocks
,
layout_bboxes
):
...
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
...
@@ -20,20 +21,19 @@ def __add_line_period(blocks, layout_bboxes):
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
last_span
=
line
[
'spans'
][
-
1
]
last_span
=
line
[
'spans'
][
-
1
]
span_type
=
last_span
[
'type'
]
span_type
=
last_span
[
'type'
]
if
span_type
in
[
TEXT
,
INLINE_EQUATION
]:
if
span_type
in
[
INLINE_EQUATION
]:
span_content
=
last_span
[
'content'
].
strip
()
span_content
=
last_span
[
'content'
].
strip
()
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
==
INLINE_EQUATION
and
span_content
[
-
1
]
not
in
LINE_STOP_FLAG
:
if
span_type
in
[
INLINE_EQUATION
,
INTER_EQUATION
]:
if
span_type
in
[
INLINE_EQUATION
,
INTER
LINE
_EQUATION
]:
last_span
[
'content'
]
=
span_content
+
'.'
last_span
[
'content'
]
=
span_content
+
'.'
def
__valign_lines
(
blocks
,
layout_bboxes
):
def
__valign_lines
(
blocks
,
layout_bboxes
):
"""
"""
对齐行的左侧和右侧。
在一个layoutbox内对齐行的左侧和右侧。
扫描行的左侧和右侧,如果x0, x1差距不超过3就强行对齐到所处layout的左右两侧(和layout有一段距离)。
扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。
3是个经验值,TODO,计算得来
3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。
"""
"""
min_distance
=
3
min_distance
=
3
...
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
...
@@ -159,11 +159,14 @@ def __split_para_in_layoutbox(lines_group, layout_bboxes, lang="en", char_avg_le
else
:
else
:
para
.
append
(
line
)
para
.
append
(
line
)
else
:
# 其他,图片、表格、行间公式,各自占一段
else
:
# 其他,图片、表格、行间公式,各自占一段
para
.
append
(
line
)
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
para
=
[]
else
:
paras
.
append
([
line
])
para
=
[]
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# logger.info(para_text)
# logger.info(para_text)
para
=
[]
if
len
(
para
)
>
0
:
if
len
(
para
)
>
0
:
paras
.
append
(
para
)
paras
.
append
(
para
)
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
# para_text = ''.join([get_span_text(span) for line in para for span in line['spans']])
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment