Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
09269c84
Commit
09269c84
authored
Mar 20, 2024
by
许瑞
Browse files
feat: add extract_train_data
parent
056aed86
Changes
7
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1591 additions
and
134 deletions
+1591
-134
magic_pdf/pdf_parse_for_train.py
magic_pdf/pdf_parse_for_train.py
+684
-0
magic_pdf/pipeline.py
magic_pdf/pipeline.py
+310
-134
magic_pdf/train_utils/__init__.py
magic_pdf/train_utils/__init__.py
+0
-0
magic_pdf/train_utils/convert_to_train_format.py
magic_pdf/train_utils/convert_to_train_format.py
+52
-0
magic_pdf/train_utils/extract_caption.py
magic_pdf/train_utils/extract_caption.py
+59
-0
magic_pdf/train_utils/remove_footer_header.py
magic_pdf/train_utils/remove_footer_header.py
+159
-0
magic_pdf/train_utils/vis_utils.py
magic_pdf/train_utils/vis_utils.py
+327
-0
No files found.
magic_pdf/pdf_parse_for_train.py
0 → 100644
View file @
09269c84
This diff is collapsed.
Click to expand it.
magic_pdf/pipeline.py
View file @
09269c84
This diff is collapsed.
Click to expand it.
magic_pdf/train_utils/__init__.py
0 → 100644
View file @
09269c84
magic_pdf/train_utils/convert_to_train_format.py
0 → 100644
View file @
09269c84
def convert_to_train_format(jso: dict) -> list[dict]:
    """Convert per-page parse results into the layout-training annotation format.

    Args:
        jso: Mapping whose values are per-page dicts; the mapping keys are
            ignored. Each page dict must provide ``page_idx``, ``page_size``
            (a ``(width, height)`` pair), ``image_bboxes_with_caption``,
            ``table_bboxes_with_caption``, ``bak_page_no_bboxes``,
            ``bak_header_bboxes``, ``bak_footer_bboxes``, ``para_blocks``,
            ``inline_equations`` and ``interline_equations``.

    Returns:
        One dict per page, in the iteration order of ``jso``, shaped as
        ``{"page_info": {"page_no", "height", "width"}, "bboxes": [...]}``.
        Every entry of ``bboxes`` has a ``category_id`` and a ``bbox``; image
        and table entries additionally carry ``caption_bbox`` when the source
        item has a ``caption`` key.

    Raises:
        KeyError: If a page dict lacks one of the required keys.
    """
    # (source key, category id) for items that are dicts holding "bbox" and
    # an optional "caption".
    captioned_sources = (
        ("image_bboxes_with_caption", 1),
        ("table_bboxes_with_caption", 7),
    )
    # (source key, category id) for items that are bare bounding boxes.
    raw_box_sources = (
        ("bak_page_no_bboxes", 4),
        ("bak_header_bboxes", 3),
        ("bak_footer_bboxes", 6),
    )
    # (source key, category id) for items that are dicts holding "bbox".
    # Footnotes are not exported: no example has been seen so far.
    boxed_item_sources = (
        ("para_blocks", 2),
        ("inline_equations", 13),
        ("interline_equations", 10),
    )

    pages = []
    for page in jso.values():
        page_idx = page["page_idx"]
        width, height = page["page_size"]
        bboxes: list[dict] = []

        for key, category_id in captioned_sources:
            for item in page[key]:
                entry = {"category_id": category_id, "bbox": item["bbox"]}
                if "caption" in item:
                    entry["caption_bbox"] = item["caption"]
                bboxes.append(entry)

        for key, category_id in raw_box_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": box} for box in page[key]
            )

        for key, category_id in boxed_item_sources:
            bboxes.extend(
                {"category_id": category_id, "bbox": item["bbox"]}
                for item in page[key]
            )

        pages.append(
            {
                "page_info": {"page_no": page_idx, "height": height, "width": width},
                "bboxes": bboxes,
            }
        )

    return pages
magic_pdf/train_utils/extract_caption.py
0 → 100644
View file @
09269c84
from
magic_pdf.libs.boxbase
import
_is_in
def extract_caption_bbox(outer: list, inner: list) -> list:
    """Pair each inner box with an enclosing outer box and derive a caption box.

    For every box in ``inner``, the first not-yet-consumed box of ``outer``
    that contains it according to ``_is_in`` -- and that is not the same box
    within a 0.01 tolerance on all four coordinates -- is selected. The outer
    box is then cut into the four strips that lie outside the inner box on
    each side, and the strip with the largest area is reported as the caption.

    Returns:
        A list with one dict per inner box::

            {
                "bbox": [1, 2, 3, 4],
                "caption": [5, 6, 7, 8],  # present only when an outer box matched
            }
    """
    matched = 0  # for debug
    print(outer, inner)

    def is_float_equal(a, b):
        # Non-strict float comparison: closer than 0.01 counts as equal.
        return abs(a - b) < 0.01

    def strip_area(rect):
        return abs(rect[0] - rect[2]) * abs(rect[1] - rect[3])

    unused_outer = dict(enumerate(outer))
    results = []
    for inner_box in inner:
        ix0, iy0, ix1, iy1 = inner_box
        entry = {"bbox": inner_box[:4]}

        match_key = None
        for key, outer_box in unused_outer.items():
            ox0, oy0, ox1, oy1 = outer_box
            same_box = all(
                [
                    is_float_equal(ix0, ox0),
                    is_float_equal(iy0, oy0),
                    is_float_equal(ix1, ox1),
                    is_float_equal(iy1, oy1),
                ]
            )
            if _is_in(inner_box, outer_box) and not same_box:
                match_key = key
                break

        if match_key is not None:
            matched += 1
            # An outer box may determine the caption of only one inner box,
            # so it is consumed as soon as it has been matched.
            ox0, oy0, ox1, oy1 = unused_outer.pop(match_key)
            strips = [
                [ox0, oy0, ix0, oy1],
                [ox0, oy0, ox1, iy0],
                [ox0, iy1, ox1, oy1],
                [ix1, oy0, ox1, oy1],
            ]
            # The strip with the largest area is taken as the caption.
            strips.sort(key=strip_area)
            entry["caption"] = strips[-1]

        results.append(entry)

    print("found_count: ", matched)
    return results
magic_pdf/train_utils/remove_footer_header.py
0 → 100644
View file @
09269c84
import
re
from
magic_pdf.libs.boxbase
import
_is_in_or_part_overlap
from
magic_pdf.libs.drop_tag
import
CONTENT_IN_FOOT_OR_HEADER
,
PAGE_NO
"""
copy from pre_proc/remove_footer_header.py
"""
def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """
    Remove headers, footers and page numbers from one page.

    Removal is intended to happen at line level; after removal a text block
    that has become empty is moved to the remove list.

    NOTE(review): the function currently returns right away through the
    ``if 1:`` guard below, handing back ``image_bboxes``, ``table_bboxes`` and
    ``text_raw_blocks`` unchanged together with three empty lists. All code
    after that guard is unreachable and is kept only as the disabled
    implementation -- confirm whether the short-circuit is still wanted.

    Returns (both the early return and the disabled path): a 6-tuple of
    (images kept, tables kept, text blocks kept, text blocks removed,
    images removed, tables removed).
    """
    # Unconditional early return: header/footer removal is switched off.
    if 1:
        return image_bboxes, table_bboxes, text_raw_blocks, [], [], []

    # ---- everything below is unreachable while the guard above is in place ----
    header = []
    footer = []
    if len(header) == 0:
        model_header = header_bboxs
        if model_header:
            # Union of all header boxes.
            x0 = min([x for x, _, _, _ in model_header])
            y0 = min([y for _, y, _, _ in model_header])
            x1 = max([x1 for _, _, x1, _ in model_header])
            y1 = max([y1 for _, _, _, y1 in model_header])
            header = [x0, y0, x1, y1]
    if len(footer) == 0:
        model_footer = footer_bboxs
        if model_footer:
            # Union of all footer boxes.
            x0 = min([x for x, _, _, _ in model_footer])
            y0 = min([y for _, y, _, _ in model_footer])
            x1 = max([x1 for _, _, x1, _ in model_footer])
            y1 = max([y1 for _, _, _, y1 in model_footer])
            footer = [x0, y0, x1, y1]

    # Vertical limits of the content area: bottom edge of the header union
    # and top edge of the footer union, defaulting to the page edges.
    header_y0 = 0 if len(header) == 0 else header[3]
    footer_y0 = page_h if len(footer) == 0 else footer[1]
    if page_no_bboxs:
        # Page-number boxes in the upper / lower half of the page tighten
        # the header / footer limits respectively.
        top_part = [b for b in page_no_bboxs if b[3] < page_h / 2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2]

        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]

    # From here on header/footer are full-width bands above/below the content.
    header = [0, 0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    # The header/footer boundaries have been computed above; removal starts below.
    """以上计算出来了页眉页脚的边界,下面开始进行删除"""
    text_block_to_remove = []
    # First check every text block.
    for blk in text_raw_blocks:
        if len(blk["lines"]) > 0:
            for line in blk["lines"]:
                line_del = []
                for span in line["spans"]:
                    span_del = []
                    if span["bbox"][3] < header_y0:
                        span_del.append(span)
                    elif _is_in_or_part_overlap(
                        span["bbox"], header
                    ) or _is_in_or_part_overlap(span["bbox"], footer):
                        span_del.append(span)
                for span in span_del:
                    line["spans"].remove(span)
                if not line["spans"]:
                    line_del.append(line)

            for line in line_del:
                blk["lines"].remove(line)
        else:
            # if not blk['lines']:
            blk["tag"] = CONTENT_IN_FOOT_OR_HEADER
            text_block_to_remove.append(blk)

    # Sometimes the page number is so small that it slightly overlaps
    # content_boundry and ends up in the body text, so page numbers are
    # additionally removed at span granularity.
    """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除"""
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if _is_in_or_part_overlap(
                    pagenobox, block["bbox"]
                ):  # remove the page number at span level
                    for line in block["lines"]:
                        for span in line["spans"]:
                            if _is_in_or_part_overlap(pagenobox, span["bbox"]):
                                # span['text'] = ''
                                span["tag"] = PAGE_NO
                                # Check whether this block consists of only this one
                                # span; if so, remove the whole block as well.
                                if len(line["spans"]) == 1 and len(block["lines"]) == 1:
                                    page_no_block_2_remove.append(block)
    else:
        # Test whether the last block is a page number. Rule: the last block has
        # exactly one line with one span, and its text is made of digits, spaces
        # and symbols -- no letters -- and contains at least one digit.
        if len(text_raw_blocks) > 0:
            # Sorts the caller's list in place, largest bbox[1] first.
            text_raw_blocks.sort(key=lambda x: x["bbox"][1], reverse=True)
            last_block = text_raw_blocks[0]
            if len(last_block["lines"]) == 1:
                last_line = last_block["lines"][0]
                if len(last_line["spans"]) == 1:
                    last_span = last_line["spans"][0]
                    if (
                        last_span["text"].strip()
                        and not re.search("[a-zA-Z]", last_span["text"])
                        and re.search("[0-9]", last_span["text"])
                    ):
                        last_span["tag"] = PAGE_NO
                        page_no_block_2_remove.append(last_block)

    for b in page_no_block_2_remove:
        text_block_to_remove.append(b)

    for blk in text_block_to_remove:
        if blk in text_raw_blocks:
            text_raw_blocks.remove(blk)

    # text_raw_blocks has been mutated in place; the "remaining" list is the same object.
    text_block_remain = text_raw_blocks
    # Images/tables are kept only when _is_in_or_part_overlap reports them
    # against the content area.
    image_bbox_to_remove = [
        bbox
        for bbox in image_bboxes
        if not _is_in_or_part_overlap(bbox, content_boundry)
    ]

    image_bbox_remain = [
        bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry)
    ]
    table_bbox_to_remove = [
        bbox
        for bbox in table_bboxes
        if not _is_in_or_part_overlap(bbox, content_boundry)
    ]
    table_bbox_remain = [
        bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry)
    ]

    # 1, 2, 3
    return (
        image_bbox_remain,
        table_bbox_remain,
        text_block_remain,
        text_block_to_remove,
        image_bbox_to_remove,
        table_bbox_to_remove,
    )
magic_pdf/train_utils/vis_utils.py
0 → 100644
View file @
09269c84
from
magic_pdf.libs.commons
import
fitz
import
os
from
magic_pdf.libs.coordinate_transform
import
get_scale_ratio
def draw_model_output(
    raw_pdf_doc: fitz.Document, paras_dict_arr: list[dict], save_path: str
):
    """Draw the model's layout detections onto every page and save the PDF.

    Args:
        raw_pdf_doc: Document to draw on; its pages are modified in place and
            the document is written to ``save_path``.
        paras_dict_arr: One dict per page, indexed by page position. Each is
            passed to ``get_scale_ratio`` and must hold ``layout_dets``, a
            list of detections with ``poly`` and ``category_id``. Example
            shape::

                {"layout_dets": [], "subfield_dets": [],
                 "page_info": {"page_no": 22, "height": 1650, "width": 1275}}

        save_path: Destination file; missing parent directories are created.

    Category ids of the layout model::

        0 title, 1 figure, 2 plain text, 3 header, 4 page number,
        5 footnote, 6 footer, 7 table, 8 table caption, 9 figure caption,
        10 equation, 11 full column, 12 sub column, 13 embedding (inline
        formula), 14 isolated (single-line formula)
    """
    # Detections of these categories are outlined in red, all others in green.
    non_body_categories = (3, 4, 5, 6, 0)
    color_map = {
        "body": fitz.pdfcolor["green"],
        "non_body": fitz.pdfcolor["red"],
    }

    for i, page in enumerate(raw_pdf_doc):
        page_dets = paras_dict_arr[i]
        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
            page_dets, page
        )

        for block in page_dets["layout_dets"]:
            poly = block["poly"]
            # Presumably poly is a flat list of 4 corner points
            # (x0, y0, x1, y1, x2, y2, x3, y3) -- TODO confirm. Indices 0/2
            # are scaled horizontally and 1/5 vertically.
            left = poly[0] / horizontal_scale_ratio
            top = poly[1] / vertical_scale_ratio
            right = poly[2] / horizontal_scale_ratio
            bottom = poly[5] / vertical_scale_ratio
            # Normalise so the rectangle is well-formed whatever the corner order.
            left, right = min(left, right), max(left, right)
            top, bottom = min(top, bottom), max(top, bottom)

            if block["category_id"] in non_body_categories:
                color = color_map["non_body"]
            else:
                color = color_map["body"]

            rect = fitz.Rect([left, top, right, bottom])
            page.draw_rect(rect, fill=None, width=0.5, overlay=True, color=color)

    # dirname() is "" for a bare file name, and os.makedirs("") raises, so only
    # create the directory when there is one to create.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    raw_pdf_doc.save(save_path)
def _write_debug_pdf(width, height, bboxes1, bboxes2, bboxes3, save_path):
    """Write a fresh one-page PDF of the given size showing three box groups.

    ``bboxes1`` is drawn with a red outline and a translucent blue fill,
    ``bboxes2`` with a translucent yellow fill and no outline, ``bboxes3``
    with a red outline only. Only the first four values of each box are used.
    An existing file at ``save_path`` is replaced.
    """
    if os.path.exists(save_path):
        # Remove the file left over from a previous run.
        os.remove(save_path)

    layers = (
        (bboxes1, {"color": fitz.pdfcolor["red"], "fill": fitz.pdfcolor["blue"], "fill_opacity": 0.2}),
        (bboxes2, {"color": None, "fill": fitz.pdfcolor["yellow"], "fill_opacity": 0.2}),
        (bboxes3, {"color": fitz.pdfcolor["red"], "fill": None}),
    )

    # Start from a new, empty PDF document.
    doc = fitz.open("")
    try:
        new_page = doc.new_page(width=width, height=height)
        for boxes, style in layers:
            for bbox in boxes:
                # Each box gets its own shape so its style is applied independently.
                shape = new_page.new_shape()
                shape.draw_rect(fitz.Rect(*bbox[0:4]))
                shape.finish(**style)
                shape.commit()

        # dirname() is "" for a bare file name, and os.makedirs("") raises.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()


def debug_show_bbox(
    raw_pdf_doc: fitz.Document,
    page_idx: int,
    bboxes: list,
    droped_bboxes: list,
    expect_drop_bboxes: list,
    save_path: str,
    expected_page_id: int,
):
    """Write a temporary single-page PDF (overwriting ``save_path``) for debugging.

    Nothing happens unless ``page_idx == expected_page_id``. The output page
    has the size of ``raw_pdf_doc[page_idx]`` and shows:

    * ``bboxes``: red outline, translucent blue fill;
    * ``droped_bboxes``: translucent yellow fill, no outline;
    * ``expect_drop_bboxes``: red outline only.

    Args:
        raw_pdf_doc: Source document, used only for the page size.
        page_idx: Index of the page being processed.
        bboxes: Original boxes.
        droped_bboxes: Boxes that were dropped.
        expect_drop_bboxes: Boxes that were expected to be dropped.
        save_path: Destination file; missing parent directories are created.
        expected_page_id: The only page index for which output is produced.
    """
    if page_idx != expected_page_id:
        return

    page_rect = raw_pdf_doc[page_idx].rect
    _write_debug_pdf(
        page_rect.width,
        page_rect.height,
        bboxes,
        droped_bboxes,
        expect_drop_bboxes,
        save_path,
    )


def debug_show_page(
    page,
    bboxes1: list,
    bboxes2: list,
    bboxes3: list,
):
    """Write ``./tmp/debug.pdf`` with three groups of boxes on a blank page.

    The output page has the size of ``page.rect`` and is overwritten on every
    call. ``bboxes1`` is drawn with a red outline and translucent blue fill,
    ``bboxes2`` with a translucent yellow fill and no outline, ``bboxes3``
    with a red outline only.
    """
    save_path = "./tmp/debug.pdf"
    _write_debug_pdf(
        page.rect.width,
        page.rect.height,
        bboxes1,
        bboxes2,
        bboxes3,
        save_path,
    )
def draw_layout_bbox_on_page(
    raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str
):
    """Draw the layout boxes of every page onto the PDF at ``pdf_path``.

    For each page entry the layout boxes are outlined in red (filled pink
    when their ``layout_label`` is ``"U"``) and numbered with their position
    in ``layout_bboxes``; ``header`` and ``footer``, when truthy, are shaded
    on every page.

    Args:
        raw_pdf_doc: Not used by this function.
        paras_dict: Mapping whose values are per-page dicts with ``page_idx``
            and ``layout_bboxes`` (a list of dicts holding ``layout_bbox``
            and ``layout_label``). The mapping keys are ignored.
        header: Rectangle to shade as the header, or a falsy value to skip.
        footer: Rectangle to shade as the footer, or a falsy value to skip.
        pdf_path: PDF to draw on. An existing file is opened and saved
            incrementally; otherwise a new empty document is created and
            saved to this path.

    NOTE(review): when ``pdf_path`` does not exist the new document is opened
    empty, so ``doc[page_idx]`` presumably fails for any page -- confirm
    whether drawing on ``raw_pdf_doc`` was intended in that case.
    """
    border_offset = 1
    font_size = 10

    # Reuse the PDF when it already exists, otherwise start a blank document.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open("") if is_new_pdf else fitz.open(pdf_path)

    try:
        for page_paras in paras_dict.values():
            page_idx = page_paras["page_idx"]
            layouts = page_paras["layout_bboxes"]
            page = doc[page_idx]
            shape = page.new_shape()

            for order, layout in enumerate(layouts):
                rect_box = layout["layout_bbox"]
                layout_label = layout["layout_label"]
                fill_color = fitz.pdfcolor["pink"] if layout_label == "U" else None
                # Pull the sides in by 1 and push top/bottom out by border_offset.
                rect_box = [
                    rect_box[0] + 1,
                    rect_box[1] - border_offset,
                    rect_box[2] - 1,
                    rect_box[3] + border_offset,
                ]
                shape.draw_rect(fitz.Rect(*rect_box))
                shape.finish(
                    color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.4
                )
                # Draw the order number in the top-left corner of the layout box.
                shape.insert_text(
                    (rect_box[0] + 1, rect_box[1] + font_size),
                    f"{order}",
                    fontsize=font_size,
                    color=(0, 0, 0),
                )

            # Shade the header and footer areas.
            if header:
                shape.draw_rect(fitz.Rect(header))
                shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2)
            if footer:
                shape.draw_rect(fitz.Rect(footer))
                shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2)

            shape.commit()

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
def draw_layout_on_page(
    raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str
):
    """Deprecated: outline the layout boxes of one page of ``pdf_path`` in red.

    Emits a ``DeprecationWarning`` on every call. Leaf layouts (those whose
    ``sub_layout`` is empty) are outlined in red and filled when their
    ``layout_label`` is ``"U"`` -- yellow for top-level layouts, pink for
    nested ones; layouts with children are not drawn themselves, only their
    children are.

    Args:
        raw_pdf_doc: Not used by this function.
        page_idx: Index of the page of ``pdf_path`` to draw on.
        page_layout: List of layout dicts holding ``layout_bbox``,
            ``layout_label`` and ``sub_layout`` (a list of dicts of the same
            shape).
        pdf_path: PDF to draw on. An existing file is opened and saved
            incrementally; otherwise a new empty document is created and
            saved to this path (missing parent directories are created).
    """
    # A function-level import keeps this block self-contained.
    import warnings

    # Using DeprecationWarning as a decorator would rebind this name to an
    # exception instance and make the function uncallable; warn explicitly.
    warnings.warn(
        "draw_layout_on_page is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )

    def draw(shape, layout, fill_color=fitz.pdfcolor["pink"]):
        """Draw ``layout`` if it is a leaf, then recurse into its children."""
        border_offset = 1
        rect_box = layout["layout_bbox"]
        layout_label = layout["layout_label"]
        sub_layouts = layout["sub_layout"]
        if len(sub_layouts) == 0:
            leaf_fill = fill_color if layout_label == "U" else None
            # Pull the sides in by 1 and push top/bottom out by border_offset.
            rect_box = [
                rect_box[0] + 1,
                rect_box[1] - border_offset,
                rect_box[2] - 1,
                rect_box[3] + border_offset,
            ]
            shape.draw_rect(fitz.Rect(*rect_box))
            shape.finish(color=fitz.pdfcolor["red"], fill=leaf_fill, fill_opacity=0.2)

        for child in sub_layouts:
            draw(shape, child)
        shape.commit()

    # Reuse the PDF when it already exists, otherwise start a blank document.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open("") if is_new_pdf else fitz.open(pdf_path)

    try:
        page = doc[page_idx]
        shape = page.new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor["yellow"])

        # dirname() is "" for a bare file name, and os.makedirs("") raises.
        parent_dir = os.path.dirname(pdf_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment