Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b6f051d8
Commit
b6f051d8
authored
Mar 14, 2024
by
赵小蒙
Browse files
在layout.pdf中绘制drop的bbox
parent
85587b25
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
21 additions
and
11 deletions
+21
-11
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+1
-1
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+10
-3
magic_pdf/pdf_parse_by_ocr.py
magic_pdf/pdf_parse_by_ocr.py
+5
-2
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+1
-1
magic_pdf/pre_proc/ocr_span_list_modify.py
magic_pdf/pre_proc/ocr_span_list_modify.py
+4
-4
No files found.
magic_pdf/dict2md/ocr_mkcontent.py
View file @
b6f051d8
def
mk_nlp_markdown
(
pdf_info_dict
:
dict
):
def
mk_nlp_markdown
(
pdf_info_dict
:
dict
):
markdown
=
[]
markdown
=
[]
for
_
,
page_info
in
pdf_info_dict
.
items
():
for
_
,
page_info
in
pdf_info_dict
.
items
():
...
@@ -22,6 +21,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
...
@@ -22,6 +21,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
markdown
.
append
(
line_text
.
strip
()
+
' '
)
markdown
.
append
(
line_text
.
strip
()
+
' '
)
return
'
\n
'
.
join
(
markdown
)
return
'
\n
'
.
join
(
markdown
)
def
mk_mm_markdown
(
pdf_info_dict
:
dict
):
def
mk_mm_markdown
(
pdf_info_dict
:
dict
):
markdown
=
[]
markdown
=
[]
...
...
magic_pdf/libs/draw_bbox.py
View file @
b6f051d8
...
@@ -27,15 +27,22 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
...
@@ -27,15 +27,22 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
def
draw_layout_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
def
draw_layout_bbox
(
pdf_info_dict
,
input_path
,
out_path
):
layout_bbox_list
=
[]
layout_bbox_list
=
[]
dropped_bbox_list
=
[]
for
page
in
pdf_info_dict
.
values
():
for
page
in
pdf_info_dict
.
values
():
page_list
=
[]
page_layout_list
=
[]
page_dropped_list
=
[]
for
layout
in
page
[
'layout_bboxes'
]:
for
layout
in
page
[
'layout_bboxes'
]:
page_list
.
append
(
layout
[
'layout_bbox'
])
page_layout_list
.
append
(
layout
[
'layout_bbox'
])
layout_bbox_list
.
append
(
page_list
)
layout_bbox_list
.
append
(
page_layout_list
)
for
drop_tag
,
dropped_bboxes
in
page
[
'dropped_bboxes'
].
items
():
for
dropped_bbox
in
dropped_bboxes
:
page_dropped_list
.
append
(
dropped_bbox
)
dropped_bbox_list
.
append
(
page_dropped_list
)
doc
=
fitz
.
open
(
input_path
)
doc
=
fitz
.
open
(
input_path
)
for
i
,
page
in
enumerate
(
doc
):
for
i
,
page
in
enumerate
(
doc
):
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_with_number
(
i
,
layout_bbox_list
,
page
,
[
255
,
0
,
0
])
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
0
,
255
,
0
])
# Save the PDF
# Save the PDF
doc
.
save
(
f
"
{
out_path
}
/layout.pdf"
)
doc
.
save
(
f
"
{
out_path
}
/layout.pdf"
)
...
...
magic_pdf/pdf_parse_by_ocr.py
View file @
b6f051d8
...
@@ -32,7 +32,8 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
...
@@ -32,7 +32,8 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
def
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
):
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
need_remove_spans_bboxes_dict
):
return_dict
=
{
return_dict
=
{
'preproc_blocks'
:
blocks
,
'preproc_blocks'
:
blocks
,
'layout_bboxes'
:
layout_bboxes
,
'layout_bboxes'
:
layout_bboxes
,
...
@@ -46,6 +47,7 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
...
@@ -46,6 +47,7 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
'dropped_text_block'
:
dropped_text_block
,
'dropped_text_block'
:
dropped_text_block
,
'dropped_image_block'
:
dropped_image_block
,
'dropped_image_block'
:
dropped_image_block
,
'dropped_table_block'
:
dropped_table_block
,
'dropped_table_block'
:
dropped_table_block
,
'dropped_bboxes'
:
need_remove_spans_bboxes_dict
,
}
}
return
return_dict
return
return_dict
...
@@ -233,7 +235,8 @@ def parse_pdf_by_ocr(
...
@@ -233,7 +235,8 @@ def parse_pdf_by_ocr(
# 构造pdf_info_dict
# 构造pdf_info_dict
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
page_info
=
construct_page_component
(
blocks
,
layout_bboxes
,
page_id
,
page_w
,
page_h
,
layout_tree
,
images
,
tables
,
interline_equations
,
inline_equations
,
images
,
tables
,
interline_equations
,
inline_equations
,
dropped_text_block
,
dropped_image_block
,
dropped_table_block
)
dropped_text_block
,
dropped_image_block
,
dropped_table_block
,
need_remove_spans_bboxes_dict
)
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
# 在测试时,保存调试信息
# 在测试时,保存调试信息
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
b6f051d8
...
@@ -60,7 +60,7 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
...
@@ -60,7 +60,7 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
# 遍历spans,将每个span放入对应的layout中
# 遍历spans,将每个span放入对应的layout中
layout_sapns
=
[]
layout_sapns
=
[]
for
span
in
spans
:
for
span
in
spans
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
layout_bbox
)
>
0.
8
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
layout_bbox
)
>
0.
65
:
layout_sapns
.
append
(
span
)
layout_sapns
.
append
(
span
)
# 如果layout_sapns不为空,则放入new_spans中
# 如果layout_sapns不为空,则放入new_spans中
if
len
(
layout_sapns
)
>
0
:
if
len
(
layout_sapns
)
>
0
:
...
...
magic_pdf/pre_proc/ocr_span_list_modify.py
View file @
b6f051d8
...
@@ -37,18 +37,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
...
@@ -37,18 +37,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
dropped_text_block
=
[]
dropped_text_block
=
[]
dropped_image_block
=
[]
dropped_image_block
=
[]
dropped_table_block
=
[]
dropped_table_block
=
[]
for
key
,
value
in
need_remove_spans_bboxes_dict
.
items
():
for
drop_tag
,
removed_bboxes
in
need_remove_spans_bboxes_dict
.
items
():
# logger.info(f"remove spans by bbox dict,
key: {key}, value: {value
}")
# logger.info(f"remove spans by bbox dict,
drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes
}")
need_remove_spans
=
[]
need_remove_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
for
removed_bbox
in
value
:
for
removed_bbox
in
removed_bboxes
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
removed_bbox
)
>
0.5
:
if
calculate_overlap_area_in_bbox1_area_ratio
(
span
[
'bbox'
],
removed_bbox
)
>
0.5
:
need_remove_spans
.
append
(
span
)
need_remove_spans
.
append
(
span
)
break
break
for
span
in
need_remove_spans
:
for
span
in
need_remove_spans
:
spans
.
remove
(
span
)
spans
.
remove
(
span
)
span
[
'tag'
]
=
key
span
[
'tag'
]
=
drop_tag
if
span
[
'type'
]
in
[
'text'
,
'inline_equation'
,
'displayed_equation'
]:
if
span
[
'type'
]
in
[
'text'
,
'inline_equation'
,
'displayed_equation'
]:
dropped_text_block
.
append
(
span
)
dropped_text_block
.
append
(
span
)
elif
span
[
'type'
]
==
'image'
:
elif
span
[
'type'
]
==
'image'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment