Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
c5b27057
"vscode:/vscode.git/clone" did not exist on "3d08c5c035cbff2496f317395d03fd552d08b20a"
Commit
c5b27057
authored
Apr 16, 2024
by
赵小蒙
Browse files
切图逻辑修复
parent
d438b97a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
5 deletions
+19
-5
magic_pdf/libs/pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+0
-5
magic_pdf/pre_proc/cut_image.py
magic_pdf/pre_proc/cut_image.py
+19
-0
No files found.
magic_pdf/libs/pdf_image_tools.py
View file @
c5b27057
from
loguru
import
logger
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.io.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.commons
import
fitz
from
magic_pdf.libs.commons
import
fitz
...
@@ -20,10 +19,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
...
@@ -20,10 +19,6 @@ def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWri
# 新版本生成平铺路径
# 新版本生成平铺路径
img_hash256_path
=
f
"
{
compute_sha256
(
img_path
)
}
.jpg"
img_hash256_path
=
f
"
{
compute_sha256
(
img_path
)
}
.jpg"
if
any
([
bbox
[
0
]
>=
bbox
[
2
],
bbox
[
1
]
>=
bbox
[
3
]]):
logger
.
warning
(
f
"image_bboxes: 错误的box,
{
bbox
}
"
)
return
img_hash256_path
# 将坐标转换为fitz.Rect对象
# 将坐标转换为fitz.Rect对象
rect
=
fitz
.
Rect
(
*
bbox
)
rect
=
fitz
.
Rect
(
*
bbox
)
# 配置缩放倍数为3倍
# 配置缩放倍数为3倍
...
...
magic_pdf/pre_proc/cut_image.py
View file @
c5b27057
from
loguru
import
logger
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.pdf_image_tools
import
cut_image
from
magic_pdf.libs.pdf_image_tools
import
cut_image
...
@@ -10,9 +12,13 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
...
@@ -10,9 +12,13 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
for
span
in
spans
:
for
span
in
spans
:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Image
:
if
span_type
==
ContentType
.
Image
:
if
not
check_img_bbox
(
span
[
'bbox'
]):
continue
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
imageWriter
=
imageWriter
)
imageWriter
=
imageWriter
)
elif
span_type
==
ContentType
.
Table
:
elif
span_type
==
ContentType
.
Table
:
if
not
check_img_bbox
(
span
[
'bbox'
]):
continue
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
imageWriter
=
imageWriter
)
imageWriter
=
imageWriter
)
...
@@ -38,15 +44,28 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
...
@@ -38,15 +44,28 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
return
join_path
(
pdf_bytes_md5
,
type
)
return
join_path
(
pdf_bytes_md5
,
type
)
for
bbox
in
image_bboxes
:
for
bbox
in
image_bboxes
:
if
not
check_img_bbox
(
bbox
):
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
images_overlap_backup
:
for
bbox
in
images_overlap_backup
:
if
not
check_img_bbox
(
bbox
):
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"images"
),
imageWriter
)
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
image_backup_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
for
bbox
in
table_bboxes
:
for
bbox
in
table_bboxes
:
if
not
check_img_bbox
(
bbox
):
continue
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
image_path
=
cut_image
(
bbox
,
page_num
,
page
,
return_path
(
"tables"
),
imageWriter
)
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
table_info
.
append
({
"bbox"
:
bbox
,
"image_path"
:
image_path
})
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
return
image_info
,
image_backup_info
,
table_info
,
inline_eq_info
,
interline_eq_info
def check_img_bbox(bbox) -> bool:
    """Tell whether ``bbox`` is usable as a region to crop.

    The box is rejected when ``bbox[0] >= bbox[2]`` or
    ``bbox[1] >= bbox[3]``; a warning containing the offending box is
    logged in that case.

    Args:
        bbox: An indexable with at least four comparable entries,
            presumably ``(x0, y0, x1, y1)`` page coordinates -- confirm
            against the callers.

    Returns:
        ``True`` if neither comparison above holds, ``False`` otherwise.
    """
    # Both comparisons are evaluated up front (no short-circuit) so that
    # indexing errors on malformed boxes surface regardless of the values.
    first_axis_degenerate = bbox[0] >= bbox[2]
    second_axis_degenerate = bbox[1] >= bbox[3]

    if not (first_axis_degenerate or second_axis_degenerate):
        return True

    # The runtime message reads "invalid box" and is kept verbatim.
    logger.warning(f"image_bboxes: 错误的box, {bbox}")
    return False
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment