Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
2de5a79f
Unverified
Commit
2de5a79f
authored
Apr 16, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Apr 16, 2025
Browse files
Merge pull request #2251 from myhloli/dev
feat(pdf_parse): add footnote block handling in layout split
parents
cfa90743
058d3184
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
9 additions
and
5 deletions
+9
-5
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+8
-4
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+1
-1
No files found.
magic_pdf/pdf_parse_union_core_v2.py
View file @
2de5a79f
...
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
...
@@ -490,7 +490,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
return
[[
x0
,
y0
,
x1
,
y1
]]
return
[[
x0
,
y0
,
x1
,
y1
]]
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
,
footnote_blocks
):
page_line_list
=
[]
page_line_list
=
[]
def
add_lines_to_block
(
b
):
def
add_lines_to_block
(
b
):
...
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
...
@@ -519,6 +519,10 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
block
[
'real_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
block
[
'real_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
add_lines_to_block
(
block
)
add_lines_to_block
(
block
)
for
block
in
footnote_blocks
:
footnote_block
=
{
'bbox'
:
block
[:
4
]}
add_lines_to_block
(
footnote_block
)
if
len
(
page_line_list
)
>
200
:
# layoutreader最高支持512line
if
len
(
page_line_list
)
>
200
:
# layoutreader最高支持512line
return
None
return
None
...
@@ -779,7 +783,7 @@ def parse_page_core(
...
@@ -779,7 +783,7 @@ def parse_page_core(
# interline_equation_blocks参数不够准,后面切换到interline_equations上
# interline_equation_blocks参数不够准,后面切换到interline_equations上
interline_equation_blocks
=
[]
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
discarded_blocks
,
...
@@ -790,7 +794,7 @@ def parse_page_core(
...
@@ -790,7 +794,7 @@ def parse_page_core(
page_h
,
page_h
,
)
)
else
:
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
discarded_blocks
,
...
@@ -866,7 +870,7 @@ def parse_page_core(
...
@@ -866,7 +870,7 @@ def parse_page_core(
line_height
=
get_line_height
(
fix_blocks
)
line_height
=
get_line_height
(
fix_blocks
)
"""获取所有line并对line排序"""
"""获取所有line并对line排序"""
sorted_bboxes
=
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
)
sorted_bboxes
=
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
,
footnote_blocks
)
"""根据line的中位数算block的序列关系"""
"""根据line的中位数算block的序列关系"""
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
2de5a79f
...
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
...
@@ -119,7 +119,7 @@ def ocr_prepare_bboxes_for_layout_split_v2(
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
"""将剩余的bbox做分离处理,防止后面分layout时出错"""
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
# all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
all_bboxes
.
sort
(
key
=
lambda
x
:
x
[
0
]
+
x
[
1
])
return
all_bboxes
,
all_discarded_blocks
return
all_bboxes
,
all_discarded_blocks
,
footnote_blocks
def
find_blocks_under_footnote
(
all_bboxes
,
footnote_blocks
):
def
find_blocks_under_footnote
(
all_bboxes
,
footnote_blocks
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment