Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
a35785b9
"...resnet50_tensorflow.git" did not exist on "c162f7abef149f3b193e106cfa275e41ea7233f7"
Unverified
Commit
a35785b9
authored
Dec 02, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 02, 2024
Browse files
Merge pull request #1167 from opendatalab/dev
Dev -> 0.10.5
parents
384e0379
a7296f78
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
4 additions
and
4 deletions
+4
-4
magic_pdf/data/utils.py
magic_pdf/data/utils.py
+2
-2
magic_pdf/pre_proc/cut_image.py
magic_pdf/pre_proc/cut_image.py
+2
-2
No files found.
magic_pdf/data/utils.py
View file @
a35785b9
...
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
...
@@ -20,8 +20,8 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
doc
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
pm
=
doc
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
# If the width or height exceeds 9000 after scaling, do not scale further.
# If the width or height exceeds 4500 after scaling, do not scale further.
if
pm
.
width
>
90
00
or
pm
.
height
>
90
00
:
if
pm
.
width
>
45
00
or
pm
.
height
>
45
00
:
pm
=
doc
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
pm
=
doc
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
img
=
Image
.
frombytes
(
'RGB'
,
(
pm
.
width
,
pm
.
height
),
pm
.
samples
)
img
=
Image
.
frombytes
(
'RGB'
,
(
pm
.
width
,
pm
.
height
),
pm
.
samples
)
...
...
magic_pdf/pre_proc/cut_image.py
View file @
a35785b9
...
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
...
@@ -12,12 +12,12 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
for
span
in
spans
:
for
span
in
spans
:
span_type
=
span
[
'type'
]
span_type
=
span
[
'type'
]
if
span_type
==
ContentType
.
Image
:
if
span_type
==
ContentType
.
Image
:
if
not
check_img_bbox
(
span
[
'bbox'
]):
if
not
check_img_bbox
(
span
[
'bbox'
])
or
not
imageWriter
:
continue
continue
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'images'
),
imageWriter
=
imageWriter
)
imageWriter
=
imageWriter
)
elif
span_type
==
ContentType
.
Table
:
elif
span_type
==
ContentType
.
Table
:
if
not
check_img_bbox
(
span
[
'bbox'
]):
if
not
check_img_bbox
(
span
[
'bbox'
])
or
not
imageWriter
:
continue
continue
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
span
[
'image_path'
]
=
cut_image
(
span
[
'bbox'
],
page_id
,
page
,
return_path
=
return_path
(
'tables'
),
imageWriter
=
imageWriter
)
imageWriter
=
imageWriter
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment