Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d47e7b82
Unverified
Commit
d47e7b82
authored
May 06, 2024
by
Kaiwen Liu
Committed by
GitHub
May 06, 2024
Browse files
Merge branch 'magicpdf:master' into master
parents
82489929
7f51d099
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
39 additions
and
18 deletions
+39
-18
demo/demo_commons.bak
demo/demo_commons.bak
+0
-0
demo/download.bak
demo/download.bak
+0
-0
demo/ocr_demo.bak
demo/ocr_demo.bak
+0
-0
demo/pdf2md.bak
demo/pdf2md.bak
+0
-0
demo/s3pdf2md.bak
demo/s3pdf2md.bak
+0
-0
demo/text_demo.bak
demo/text_demo.bak
+0
-0
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+12
-8
magic_pdf/libs/ocr_content_type.py
magic_pdf/libs/ocr_content_type.py
+1
-0
magic_pdf/pdf_parse_union_core.py
magic_pdf/pdf_parse_union_core.py
+11
-6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+4
-1
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+11
-3
No files found.
demo/demo_commons.
py
→
demo/demo_commons.
bak
View file @
d47e7b82
File moved
demo/download.
py
→
demo/download.
bak
View file @
d47e7b82
File moved
demo/ocr_demo.
py
→
demo/ocr_demo.
bak
View file @
d47e7b82
File moved
demo/pdf2md.
py
→
demo/pdf2md.
bak
View file @
d47e7b82
File moved
demo/s3pdf2md.
py
→
demo/s3pdf2md.
bak
View file @
d47e7b82
File moved
demo/text_demo.
py
→
demo/text_demo.
bak
View file @
d47e7b82
File moved
magic_pdf/libs/draw_bbox.py
View file @
d47e7b82
...
@@ -65,14 +65,8 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
...
@@ -65,14 +65,8 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
def
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
def
draw_layout_bbox
(
pdf_info
,
pdf_bytes
,
out_path
):
layout_bbox_list
=
[]
layout_bbox_list
=
[]
blocks_bbox_list
=
[]
dropped_bbox_list
=
[]
dropped_bbox_list
=
[]
tables_list
,
tables_body_list
,
tables_caption_list
,
tables_footnote_list
=
(
tables_list
,
tables_body_list
,
tables_caption_list
,
tables_footnote_list
=
[],
[],
[],
[]
[],
[],
[],
[],
)
imgs_list
,
imgs_body_list
,
imgs_caption_list
=
[],
[],
[]
imgs_list
,
imgs_body_list
,
imgs_caption_list
=
[],
[],
[]
titles_list
=
[]
titles_list
=
[]
texts_list
=
[]
texts_list
=
[]
...
@@ -80,7 +74,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -80,7 +74,6 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_layout_list
=
[]
page_layout_list
=
[]
page_dropped_list
=
[]
page_dropped_list
=
[]
page_blocks_bbox_list
=
[]
tables
,
tables_body
,
tables_caption
,
tables_footnote
=
[],
[],
[],
[]
tables
,
tables_body
,
tables_caption
,
tables_footnote
=
[],
[],
[],
[]
imgs
,
imgs_body
,
imgs_caption
=
[],
[],
[]
imgs
,
imgs_body
,
imgs_caption
=
[],
[],
[]
titles
=
[]
titles
=
[]
...
@@ -154,12 +147,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -154,12 +147,22 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
interline_equation_list
=
[]
interline_equation_list
=
[]
image_list
=
[]
image_list
=
[]
table_list
=
[]
table_list
=
[]
dropped_list
=
[]
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_text_list
=
[]
page_text_list
=
[]
page_inline_equation_list
=
[]
page_inline_equation_list
=
[]
page_interline_equation_list
=
[]
page_interline_equation_list
=
[]
page_image_list
=
[]
page_image_list
=
[]
page_table_list
=
[]
page_table_list
=
[]
page_dropped_list
=
[]
# 构造dropped_list
for
block
in
page
[
"discarded_blocks"
]:
if
block
[
"type"
]
==
BlockType
.
Discarded
:
for
line
in
block
[
"lines"
]:
for
span
in
line
[
"spans"
]:
page_dropped_list
.
append
(
span
[
"bbox"
])
dropped_list
.
append
(
page_dropped_list
)
# 构造其余useful_list
for
block
in
page
[
"para_blocks"
]:
for
block
in
page
[
"para_blocks"
]:
if
block
[
"type"
]
in
[
if
block
[
"type"
]
in
[
BlockType
.
Text
,
BlockType
.
Text
,
...
@@ -205,6 +208,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
...
@@ -205,6 +208,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
interline_equation_list
,
page
,
[
0
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
image_list
,
page
,
[
255
,
204
,
0
],
False
)
draw_bbox_without_number
(
i
,
image_list
,
page
,
[
255
,
204
,
0
],
False
)
draw_bbox_without_number
(
i
,
table_list
,
page
,
[
204
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
table_list
,
page
,
[
204
,
0
,
255
],
False
)
draw_bbox_without_number
(
i
,
dropped_list
,
page
,
[
158
,
158
,
158
],
False
)
# Save the PDF
# Save the PDF
pdf_docs
.
save
(
f
"
{
out_path
}
/spans.pdf"
)
pdf_docs
.
save
(
f
"
{
out_path
}
/spans.pdf"
)
magic_pdf/libs/ocr_content_type.py
View file @
d47e7b82
...
@@ -17,4 +17,5 @@ class BlockType:
...
@@ -17,4 +17,5 @@ class BlockType:
Title
=
"title"
Title
=
"title"
InterlineEquation
=
"interline_equation"
InterlineEquation
=
"interline_equation"
Footnote
=
"footnote"
Footnote
=
"footnote"
Discarded
=
"discarded"
magic_pdf/pdf_parse_union_core.py
View file @
d47e7b82
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
...
@@ -17,7 +17,8 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
from
magic_pdf.pre_proc.equations_replace
import
remove_chars_in_text_blocks
,
replace_equations_in_textblock
,
\
combine_chars_to_pymudict
combine_chars_to_pymudict
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_detect_all_bboxes
import
ocr_prepare_bboxes_for_layout_split
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
from
magic_pdf.pre_proc.ocr_dict_merge
import
sort_blocks_by_layout
,
fill_spans_in_blocks
,
fix_block_spans
,
\
fix_discarded_block
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.ocr_span_list_modify
import
remove_overlaps_min_spans
,
get_qa_need_list_v2
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
from
magic_pdf.pre_proc.resolve_bbox_conflict
import
check_useful_block_horizontal_overlap
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -122,15 +123,19 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
spans
=
ocr_cut_image_and_table
(
spans
,
pdf_docs
[
page_id
],
page_id
,
pdf_bytes_md5
,
imageWriter
)
'''将所有区块的bbox整理到一起'''
'''将所有区块的bbox整理到一起'''
all_bboxes
=
ocr_prepare_bboxes_for_layout_split
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equations
,
page_w
,
page_h
)
interline_equations
,
page_w
,
page_h
)
'''先处理不需要排版的discarded_blocks'''
discarded_block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_discarded_blocks
,
spans
,
0.4
)
fix_discarded_blocks
=
fix_discarded_block
(
discarded_block_with_spans
)
'''如果当前页面没有bbox则跳过'''
'''如果当前页面没有bbox则跳过'''
if
len
(
all_bboxes
)
==
0
:
if
len
(
all_bboxes
)
==
0
:
logger
.
warning
(
f
"skip this page, not found bbox, page_id:
{
page_id
}
"
)
logger
.
warning
(
f
"skip this page, not found
useful
bbox, page_id:
{
page_id
}
"
)
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
return
ocr_construct_page_component_v2
([],
[],
page_id
,
page_w
,
page_h
,
[],
[],
[],
interline_equations
,
discarded_blocks
,
[],
[],
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
need_drop
,
drop_reason
)
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -171,7 +176,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
sorted_blocks
=
sort_blocks_by_layout
(
all_bboxes
,
layout_bboxes
)
'''将span填入排好序的blocks中'''
'''将span填入排好序的blocks中'''
block_with_spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
)
block_with_spans
,
spans
=
fill_spans_in_blocks
(
sorted_blocks
,
spans
,
0.6
)
'''对block进行fix操作'''
'''对block进行fix操作'''
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
...
@@ -181,7 +186,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''构造pdf_info_dict'''
'''构造pdf_info_dict'''
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
ocr_construct_page_component_v2
(
fix_blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
discarded_blocks
,
images
,
tables
,
interline_equations
,
fix_
discarded_blocks
,
need_drop
,
drop_reason
)
need_drop
,
drop_reason
)
return
page_info
return
page_info
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
d47e7b82
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
...
@@ -7,6 +7,7 @@ from magic_pdf.libs.ocr_content_type import BlockType
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
def
ocr_prepare_bboxes_for_layout_split
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
all_bboxes
=
[]
all_bboxes
=
[]
all_discarded_blocks
=
[]
for
image
in
img_blocks
:
for
image
in
img_blocks
:
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
])
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -38,10 +39,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
for
discarded
in
discarded_blocks
:
for
discarded
in
discarded_blocks
:
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
all_discarded_blocks
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Discarded
,
None
,
None
,
None
,
None
])
# 将footnote加入到all_bboxes中,用来计算layout
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Footnote
,
None
,
None
,
None
,
None
])
return
all_bboxes
return
all_bboxes
,
all_discarded_blocks
def
fix_text_overlap_title_blocks
(
all_bboxes
):
def
fix_text_overlap_title_blocks
(
all_bboxes
):
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
d47e7b82
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
...
@@ -141,7 +141,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
return
sort_blocks
return
sort_blocks
def
fill_spans_in_blocks
(
blocks
,
spans
):
def
fill_spans_in_blocks
(
blocks
,
spans
,
radio
):
'''
'''
将allspans中的span按位置关系,放入blocks中
将allspans中的span按位置关系,放入blocks中
'''
'''
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -156,7 +156,7 @@ def fill_spans_in_blocks(blocks, spans):
block_spans
=
[]
block_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
span_bbox
=
span
[
'bbox'
]
span_bbox
=
span
[
'bbox'
]
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
0.6
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span_bbox
,
block_bbox
)
>
radio
:
block_spans
.
append
(
span
)
block_spans
.
append
(
span
)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
...
@@ -178,7 +178,7 @@ def fill_spans_in_blocks(blocks, spans):
for
span
in
block_spans
:
for
span
in
block_spans
:
spans
.
remove
(
span
)
spans
.
remove
(
span
)
return
block_with_spans
return
block_with_spans
,
spans
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
def
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
):
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -204,6 +204,14 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return
fix_blocks
return
fix_blocks
def fix_discarded_block(discarded_block_with_spans):
    """Run ``fix_text_block`` over every discarded block.

    Args:
        discarded_block_with_spans: Iterable of blocks to fix. In the
            visible caller this is the first value returned by
            ``fill_spans_in_blocks`` for the page's discarded blocks.

    Returns:
        A new list holding the result of ``fix_text_block`` for each input
        block, in the same order as the input.
    """
    # NOTE(review): fix_text_block is defined elsewhere in this module; what
    # it does to a block is not visible here -- confirm before relying on it.
    return [fix_text_block(item) for item in discarded_block_with_spans]
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
def
merge_spans_to_block
(
spans
:
list
,
block_bbox
:
list
,
block_type
:
str
):
block_spans
=
[]
block_spans
=
[]
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment