Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
11e4e8cc
"vscode:/vscode.git/clone" did not exist on "776dff865e484bf5a00f7b3bf512ca2aaa54f313"
Commit
11e4e8cc
authored
Mar 01, 2024
by
赵小蒙
Browse files
重构目录结构
parent
56213908
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
174 additions
and
36 deletions
+174
-36
demo/download.py
demo/download.py
+0
-0
demo/draw_bbox.py
demo/draw_bbox.py
+0
-0
demo/pdf2md.py
demo/pdf2md.py
+4
-5
demo/s3pdf2md.py
demo/s3pdf2md.py
+1
-9
dict2md/__init__.py
dict2md/__init__.py
+0
-0
dict2md/mkcontent.py
dict2md/mkcontent.py
+0
-0
pdf2json_infer.py
pdf2json_infer.py
+3
-9
pipeline/pdf_parse_by_model.py
pipeline/pdf_parse_by_model.py
+11
-11
post_proc/remove_footnote.py
post_proc/remove_footnote.py
+39
-2
pre_proc/detect_equation.py
pre_proc/detect_equation.py
+0
-0
pre_proc/detect_footer_by_model.py
pre_proc/detect_footer_by_model.py
+0
-0
pre_proc/detect_footer_header_by_statistics.py
pre_proc/detect_footer_header_by_statistics.py
+0
-0
pre_proc/detect_footnote.py
pre_proc/detect_footnote.py
+0
-0
pre_proc/detect_images.py
pre_proc/detect_images.py
+0
-0
pre_proc/fix_image.py
pre_proc/fix_image.py
+0
-0
pre_proc/fix_table.py
pre_proc/fix_table.py
+0
-0
pre_proc/pdf_pre_filter.py
pre_proc/pdf_pre_filter.py
+0
-0
pre_proc/remove_footer_header.py
pre_proc/remove_footer_header.py
+116
-0
No files found.
download.py
→
demo/
download.py
View file @
11e4e8cc
File moved
draw_bbox.py
→
demo/
draw_bbox.py
View file @
11e4e8cc
File moved
pdf2md.py
→
demo/
pdf2md.py
View file @
11e4e8cc
...
@@ -3,12 +3,11 @@ import sys
...
@@ -3,12 +3,11 @@ import sys
from
pathlib
import
Path
from
pathlib
import
Path
import
click
import
click
import
json
from
loguru
import
logger
from
loguru
import
logger
from
libs.commons
import
join_path
,
parse_aws_param
,
parse_bucket_key
,
read_file
from
libs.commons
import
join_path
from
mkcontent
import
mk_mm_markdown
,
mk_nlp_markdown
from
dict2md.
mkcontent
import
mk_mm_markdown
from
pdf_parse_by_model
import
parse_pdf_by_model
from
pipeline.
pdf_parse_by_model
import
parse_pdf_by_model
...
@@ -17,7 +16,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
...
@@ -17,7 +16,7 @@ def main(s3_pdf_path: str, s3_pdf_profile: str, pdf_model_path: str, pdf_model_p
pth
=
Path
(
s3_pdf_path
)
pth
=
Path
(
s3_pdf_path
)
book_name
=
pth
.
name
book_name
=
pth
.
name
# book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
# book_name = "".join(os.path.basename(s3_pdf_path).split(".")[0:-1])
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
,
".."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"..
/..
"
,
".."
,
"tmp"
,
"unittest"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
save_path
=
join_path
(
save_tmp_path
,
"md"
)
text_content_save_path
=
f
"
{
save_path
}
/
{
book_name
}
/book.md"
text_content_save_path
=
f
"
{
save_path
}
/
{
book_name
}
/book.md"
# metadata_save_path = f"{save_path}/{book_name}/metadata.json"
# metadata_save_path = f"{save_path}/{book_name}/metadata.json"
...
...
s3pdf2md.py
→
demo/
s3pdf2md.py
View file @
11e4e8cc
import
os
import
sys
from
pathlib
import
Path
from
pathlib
import
Path
import
click
import
click
import
json
import
json
from
loguru
import
logger
from
libs.commons
import
join_path
,
parse_aws_param
,
parse_bucket_key
,
read_file
from
mkcontent
import
mk_nlp_markdown
from
pdf2md
import
main
from
pdf_parse_by_model
import
parse_pdf_by_model
from
demo.pdf2md
import
main
@
click
.
command
()
@
click
.
command
()
...
...
__init__.py
→
dict2md/
__init__.py
View file @
11e4e8cc
File moved
mkcontent.py
→
dict2md/
mkcontent.py
View file @
11e4e8cc
File moved
pdf2json_infer.py
View file @
11e4e8cc
import
sys
import
sys
from
typing
import
Tuple
from
typing
import
Tuple
import
os
import
os
import
click
import
boto3
,
json
import
boto3
,
json
from
botocore.config
import
Config
from
botocore.config
import
Config
from
libs.commons
import
fitz
from
libs.commons
import
fitz
...
@@ -24,16 +23,11 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
...
@@ -24,16 +23,11 @@ from validation import cal_edit_distance, format_gt_bbox, label_match, detect_va
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
# from bbox_sort import bbox_sort, CONTENT_IDX, CONTENT_TYPE_IDX
from
layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
layout.bbox_sort
import
bbox_sort
,
CONTENT_IDX
,
CONTENT_TYPE_IDX
from
p
df2text_recogFigure
import
parse_images
# 获取figures的bbox
from
p
re_proc.detect_images
import
parse_images
# 获取figures的bbox
from
pdf2text_recogTable
import
parse_tables
# 获取tables的bbox
from
pdf2text_recogTable
import
parse_tables
# 获取tables的bbox
from
pdf2text_recogEquation
import
parse_equations
# 获取equations的bbox
from
pre_proc.detect_equation
import
parse_equations
# 获取equations的bbox
from
pdf2text_recogTitle
import
parse_titles
# 获取titles的bbox
from
pdf2text_recogHeader
import
parse_headers
# 获取headers的bbox
from
pdf2text_recogPageNo
import
parse_pageNos
# 获取pageNos的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
# from pdf2text_recogFootnote import parse_footnotes # 获取footnotes的bbox
from
pdf2text_recogFooter
import
parse_footers
# 获取footers的bbox
from
pdf2text_recogPara
import
process_blocks_per_page
from
pdf2text_evaluatePdfLayout
import
evaluate_pdf_layout
# 评估页面的Layout是否是规整的。
from
pdf2text_recogPara
import
process_blocks_per_page
,
postprocess_paras_pipeline
from
libs.commons
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
from
libs.commons
import
parse_aws_param
,
parse_bucket_key
,
read_file
,
join_path
...
...
pdf_parse_by_model.py
→
pipeline/
pdf_parse_by_model.py
View file @
11e4e8cc
...
@@ -15,14 +15,13 @@ from libs.drop_reason import DropReason
...
@@ -15,14 +15,13 @@ from libs.drop_reason import DropReason
from
libs.markdown_utils
import
escape_special_markdown_char
from
libs.markdown_utils
import
escape_special_markdown_char
from
libs.safe_filename
import
sanitize_filename
from
libs.safe_filename
import
sanitize_filename
from
libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
libs.vis_utils
import
draw_bbox_on_page
,
draw_layout_bbox_on_page
from
pdf2text_recogFigure
import
parse_images
from
pre_proc.detect_images
import
parse_images
from
pdf2text_recogFootnoteLine
import
remove_headder_footer_one_page
# 获取figures的bbox
from
pdf2text_recogTable
import
parse_tables
# 获取tables的bbox
from
pdf2text_recogTable
import
parse_tables
# 获取tables的bbox
from
p
df2
te
x
t_
recogE
quation
import
parse_equations
# 获取equations的bbox
from
p
re_proc.de
te
c
t_
e
quation
import
parse_equations
# 获取equations的bbox
from
pdf2text_recogHeader
import
parse_headers
# 获取headers的bbox
from
pdf2text_recogHeader
import
parse_headers
# 获取headers的bbox
from
pdf2text_recogPageNo
import
parse_pageNos
# 获取pageNos的bbox
from
pdf2text_recogPageNo
import
parse_pageNos
# 获取pageNos的bbox
from
p
df2text_recogF
ootnote
import
parse_footnotes_by_model
,
parse_footnotes_by_rule
# 获取footnotes的bbox
from
p
re_proc.detect_f
ootnote
import
parse_footnotes_by_model
,
parse_footnotes_by_rule
# 获取footnotes的bbox
from
p
df2
te
x
t_
recogFooter
import
parse_footers
# 获取footers的bbox
from
p
re_proc.de
te
c
t_
footer_by_model
import
parse_footers
# 获取footers的bbox
from
pdf2text_recogPara
import
(
from
pdf2text_recogPara
import
(
ParaProcessPipeline
,
ParaProcessPipeline
,
...
@@ -34,6 +33,7 @@ from pdf2text_recogPara import (
...
@@ -34,6 +33,7 @@ from pdf2text_recogPara import (
)
)
from
pre_proc.main_text_font
import
get_main_text_font
from
pre_proc.main_text_font
import
get_main_text_font
from
pre_proc.remove_colored_strip_bbox
import
remove_colored_strip_textblock
from
pre_proc.remove_colored_strip_bbox
import
remove_colored_strip_textblock
from
pre_proc.remove_footer_header
import
remove_headder_footer_one_page
'''
'''
from para.para_pipeline import ParaProcessPipeline
from para.para_pipeline import ParaProcessPipeline
...
@@ -48,17 +48,17 @@ from para.exceptions import (
...
@@ -48,17 +48,17 @@ from para.exceptions import (
from
libs.commons
import
read_file
,
join_path
from
libs.commons
import
read_file
,
join_path
from
libs.pdf_image_tools
import
save_images_by_bboxes
from
libs.pdf_image_tools
import
save_images_by_bboxes
from
post_proc.footnot
e_remov
e
import
merge_footnote_blocks
,
remove_footnote_blocks
from
post_proc.
remove_
footnote
import
merge_footnote_blocks
,
remove_footnote_blocks
from
pre_proc.citationmarker_remove
import
remove_citation_marker
from
pre_proc.citationmarker_remove
import
remove_citation_marker
from
pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
pre_proc.equations_replace
import
combine_chars_to_pymudict
,
remove_chars_in_text_blocks
,
replace_equations_in_textblock
from
pre_proc.pdf_filter
import
pdf_filter
from
pre_proc.pdf_
pre_
filter
import
pdf_filter
from
pre_proc.detect_footer_header
import
drop_footer_header
from
pre_proc.detect_footer_header
_by_statistics
import
drop_footer_header
from
pre_proc.construct_paras
import
construct_page_component
from
pre_proc.construct_paras
import
construct_page_component
from
pre_proc.image
_fix
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
pre_proc.
fix_
image
import
combine_images
,
fix_image_vertical
,
fix_seperated_image
,
include_img_title
from
post_proc.pdf_post_filter
import
pdf_post_filter
from
post_proc.pdf_post_filter
import
pdf_post_filter
from
pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
pre_proc.remove_rotate_bbox
import
get_side_boundry
,
remove_rotate_side_textblock
,
remove_side_blank_block
from
pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
pre_proc.resolve_bbox_conflict
import
check_text_block_horizontal_overlap
,
resolve_bbox_overlap_conflict
from
pre_proc.table
_fix
import
fix_table_text_block
,
fix_tables
,
include_table_title
from
pre_proc.
fix_
table
import
fix_table_text_block
,
fix_tables
,
include_table_title
denseSingleLineBlockException_msg
=
DenseSingleLineBlockException
().
message
denseSingleLineBlockException_msg
=
DenseSingleLineBlockException
().
message
titleDetectionException_msg
=
TitleDetectionException
().
message
titleDetectionException_msg
=
TitleDetectionException
().
message
...
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
...
@@ -108,7 +108,7 @@ def parse_pdf_by_model(
debug_mode
=
False
,
debug_mode
=
False
,
):
):
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
pdf_bytes
=
read_file
(
s3_pdf_path
,
s3_pdf_profile
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
,
"tmp"
,
"unittest"
)
save_tmp_path
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
"..
/..
"
,
"tmp"
,
"unittest"
)
md_bookname_save_path
=
""
md_bookname_save_path
=
""
book_name
=
sanitize_filename
(
book_name
)
book_name
=
sanitize_filename
(
book_name
)
if
debug_mode
:
if
debug_mode
:
...
...
post_proc/footnot
e_remov
e.py
→
post_proc/
remove_
footnote.py
View file @
11e4e8cc
from
libs.boxbase
import
_is_in
from
libs.boxbase
import
_is_in
,
_is_in_or_part_overlap
from
pdf2text_recogFootnoteLine
import
remove_footnote_text
,
remove_footnote_image
import
collections
# 统计库
import
collections
# 统计库
...
@@ -113,3 +112,41 @@ def remove_footnote_blocks(page_info):
...
@@ -113,3 +112,41 @@ def remove_footnote_blocks(page_info):
del
page_info
[
'merged_bboxes'
]
del
page_info
[
'merged_bboxes'
]
del
page_info
[
'footnote_bboxes_tmp'
]
del
page_info
[
'footnote_bboxes_tmp'
]
return
page_info
return
page_info
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """
    Split the text blocks of one page into body blocks and footnote blocks.

    Every block whose ``'bbox'`` is reported by ``_is_in_or_part_overlap`` as
    hitting at least one footnote bbox gets ``block['tag'] = 'footnote'`` and
    is taken out of ``raw_text_block``. The list is modified in place.

    :param raw_text_block: list of text-block dicts for the current page; each
        dict must have a ``'bbox'`` entry
    :param footnote_bboxes: list of footnote bboxes for the current page
    :return: tuple ``(raw_text_block, footnote_text_blocks)`` - the same list
        object that was passed in (now without the footnote blocks) and the
        list of blocks that were tagged and removed
    """
    footnote_text_blocks = []
    remaining_blocks = []
    for block in raw_text_block:
        text_bbox = block['bbox']
        # TODO: be stricter and do this check at line level.
        # A generator lets any() stop at the first matching footnote bbox.
        if any(_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes):
            block['tag'] = 'footnote'
            footnote_text_blocks.append(block)
        else:
            remaining_blocks.append(block)

    # The list must not be mutated while it is being iterated, so the
    # survivors are written back afterwards. Slice assignment keeps the
    # caller's list object and avoids one O(n) list.remove() per footnote.
    raw_text_block[:] = remaining_blocks

    return raw_text_block, footnote_text_blocks
def remove_footnote_image(image_blocks, footnote_bboxes):
    """
    Split the image blocks of one page into body images and footnote images.

    An image block is treated as a footnote image when ``_is_in`` returns a
    truthy value for its ``'bbox'`` and at least one footnote bbox. Those
    blocks are taken out of ``image_blocks``, which is modified in place.

    :param image_blocks: list of image-block dicts for the current page; each
        dict must have a ``'bbox'`` entry
    :param footnote_bboxes: list of footnote bboxes for the current page
    :return: tuple ``(image_blocks, footnote_imgs_blocks)`` - the same list
        object that was passed in (now without the footnote images) and the
        list of image blocks that were removed
    """
    footnote_imgs_blocks = []
    remaining_blocks = []
    for image_block in image_blocks:
        # A generator lets any() stop at the first matching footnote bbox.
        if any(_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes):
            footnote_imgs_blocks.append(image_block)
        else:
            remaining_blocks.append(image_block)

    # Write the survivors back in one step instead of calling list.remove()
    # once per footnote image; slice assignment keeps the caller's list object.
    image_blocks[:] = remaining_blocks

    return image_blocks, footnote_imgs_blocks
\ No newline at end of file
p
df2
te
x
t_
recogE
quation.py
→
p
re_proc/de
te
c
t_
e
quation.py
View file @
11e4e8cc
File moved
p
df2
te
x
t_
recogFooter
.py
→
p
re_proc/de
te
c
t_
footer_by_model
.py
View file @
11e4e8cc
File moved
pre_proc/detect_footer_header.py
→
pre_proc/detect_footer_header
_by_statistics
.py
View file @
11e4e8cc
File moved
p
df2text_recogF
ootnote.py
→
p
re_proc/detect_f
ootnote.py
View file @
11e4e8cc
File moved
p
df2text_recogFigure
.py
→
p
re_proc/detect_images
.py
View file @
11e4e8cc
File moved
pre_proc/image
_fix
.py
→
pre_proc/
fix_
image.py
View file @
11e4e8cc
File moved
pre_proc/table
_fix
.py
→
pre_proc/
fix_
table.py
View file @
11e4e8cc
File moved
pre_proc/pdf_filter.py
→
pre_proc/pdf_
pre_
filter.py
View file @
11e4e8cc
File moved
pre_proc/remove_footer_header.py
0 → 100644
View file @
11e4e8cc
import
re
from
libs.boxbase
import
_is_in_or_part_overlap
def _union_bbox(bboxes):
    """Return the [x0, y0, x1, y1] envelope enclosing every 4-value box in ``bboxes``."""
    x0s, y0s, x1s, y1s = zip(*bboxes)
    return [min(x0s), min(y0s), max(x1s), max(y1s)]


def _split_by_content_area(bboxes, content_boundry):
    """Partition ``bboxes`` into (kept, dropped), testing each box against the content area once."""
    kept, dropped = [], []
    for bbox in bboxes:
        if _is_in_or_part_overlap(bbox, content_boundry):
            kept.append(bbox)
        else:
            dropped.append(bbox)
    return kept, dropped


def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                   page_no_bboxs, page_w, page_h):
    """
    Remove header, footer and page-number content from a single page.

    Spans lying in the header/footer area are deleted from their lines, lines
    left without spans are deleted from their block, and a block left without
    lines is tagged ``'in-foot-header-area'`` and moved to the removal list.
    ``text_raw_blocks`` and the nested ``'lines'`` / ``'spans'`` lists are
    modified in place.

    :param text_raw_blocks: list of text-block dicts with ``'bbox'`` and
        ``'lines'``; each line has ``'spans'``, each span has ``'bbox'`` and
        ``'text'``
    :param image_bboxes: list of image bboxes on the page
    :param table_bboxes: list of table bboxes on the page
    :param header_bboxs: list of (x0, y0, x1, y1) header boxes, may be empty
    :param footer_bboxs: list of (x0, y0, x1, y1) footer boxes, may be empty
    :param page_no_bboxs: list of page-number boxes, may be empty
    :param page_w: page width
    :param page_h: page height
    :return: tuple ``(image_bbox_remain, table_bbox_remain, text_block_remain,
        text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)``;
        ``text_block_remain`` is the ``text_raw_blocks`` list object itself
    """
    # Without a detected header/footer the content area reaches the page edge.
    header_y0 = _union_bbox(header_bboxs)[3] if header_bboxs else 0
    footer_y0 = _union_bbox(footer_bboxs)[1] if footer_bboxs else page_h

    if page_no_bboxs:
        # Page numbers in the top half push the header boundary down,
        # those in the bottom half pull the footer boundary up.
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]

        top_max_y0 = max(b[1] for b in top_part) if top_part else 0
        btn_min_y1 = min(b[3] for b in btn_part) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    # The header/footer boundaries are known now; start deleting.
    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            # Collect the decision for EVERY span of the line. Resetting the
            # delete list per span would only ever drop the last span.
            line['spans'][:] = [
                span for span in line['spans']
                if not (
                    span['bbox'][3] < header_y0
                    or _is_in_or_part_overlap(span['bbox'], header)
                    or _is_in_or_part_overlap(span['bbox'], footer)
                )
            ]
        # Likewise drop every line that ended up without spans, not only the last one.
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]

        # A block that has (or is left with) no lines belongs to the header/footer area.
        if not blk['lines']:
            blk['tag'] = 'in-foot-header-area'
            text_block_to_remove.append(blk)

    # Sometimes a page number is so small that it slightly overlaps
    # content_boundry and ends up in the body text, so page numbers are
    # additionally handled at span granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if not _is_in_or_part_overlap(pagenobox, block['bbox']):
                    continue
                # Tag the page number at span level.
                for line in block['lines']:
                    for span in line['spans']:
                        if not _is_in_or_part_overlap(pagenobox, span['bbox']):
                            continue
                        span['tag'] = "page-no"
                        # If this span is the only content of the block, remove the
                        # whole block too (queued once, even if several boxes hit it).
                        if (len(line['spans']) == 1 and len(block['lines']) == 1
                                and not any(b is block for b in page_no_block_2_remove)):
                            page_no_block_2_remove.append(block)
    elif len(text_raw_blocks) > 0:
        # No page-number boxes given: test whether the bottom-most block is a
        # page number. Rule: exactly one line with one span whose text contains
        # a digit and no ASCII letter (digits, spaces and symbols only).
        # NOTE: this sort reorders the caller's list in place.
        text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
        last_block = text_raw_blocks[0]
        if len(last_block['lines']) == 1:
            last_line = last_block['lines'][0]
            if len(last_line['spans']) == 1:
                last_span = last_line['spans'][0]
                last_text = last_span['text']
                if last_text.strip() and not re.search('[a-zA-Z]', last_text) and re.search('[0-9]', last_text):
                    last_span['tag'] = "page-no"
                    page_no_block_2_remove.append(last_block)

    text_block_to_remove.extend(page_no_block_2_remove)

    # Drop the flagged blocks by identity in one pass; slice assignment keeps
    # the caller's list object.
    removed_ids = {id(blk) for blk in text_block_to_remove}
    text_raw_blocks[:] = [blk for blk in text_raw_blocks if id(blk) not in removed_ids]
    text_block_remain = text_raw_blocks

    image_bbox_remain, image_bbox_to_remove = _split_by_content_area(image_bboxes, content_boundry)
    table_bbox_remain, table_bbox_to_remove = _split_by_content_area(table_bboxes, content_boundry)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment