Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
389826c5
Commit
389826c5
authored
Jun 18, 2024
by
赵小蒙
Browse files
update custom model framework
parent
c96aa88d
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
75 additions
and
64 deletions
+75
-64
magic_pdf/model/360_layout_analysis.py
magic_pdf/model/360_layout_analysis.py
+0
-0
magic_pdf/model/doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+61
-0
magic_pdf/model/model_list.py
magic_pdf/model/model_list.py
+2
-0
magic_pdf/model/pp_structure_v2.py
magic_pdf/model/pp_structure_v2.py
+2
-58
magic_pdf/pipe/OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+1
-1
magic_pdf/pipe/TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+1
-1
magic_pdf/pipe/UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+1
-1
magic_pdf/user_api.py
magic_pdf/user_api.py
+7
-3
No files found.
magic_pdf/model/
doc_analyze_by_
360layout.py
→
magic_pdf/model/360
_
layout
_analysis
.py
View file @
389826c5
File moved
magic_pdf/model/doc_analyze_by_custom_model.py
0 → 100644
View file @
389826c5
import
fitz
import
cv2
from
PIL
import
Image
import
numpy
as
np
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.pp_structure_v2
import
CustomPaddleModel
def dict_compare(d1, d2):
    """Report whether two dicts hold exactly the same key/value pairs.

    Args:
        d1: First mapping; must provide ``items()``.
        d2: Second mapping; must provide ``items()``.

    Returns:
        The result of comparing the two ``items()`` views for equality.
    """
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items
def remove_duplicates_dicts(lst):
    """Drop repeated dicts from a list, keeping the first occurrence of each.

    Dicts are compared pairwise with ``dict_compare`` rather than hashed, so
    entries with unhashable values (for example lists) are accepted.

    Args:
        lst: Iterable of dicts.

    Returns:
        A new list with the surviving dicts in their original order.
    """
    deduped = []
    for candidate in lst:
        for kept in deduped:
            if dict_compare(candidate, kept):
                # An equal dict was already kept; skip this one.
                break
        else:
            deduped.append(candidate)
    return deduped
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of an in-memory PDF into a BGR image array.

    Args:
        pdf_bytes: Raw PDF content, opened with ``fitz.open("pdf", ...)``.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72`` on
            both axes.

    Returns:
        One dict per page, in page order, with keys ``"img"`` (the array
        produced by ``cv2.cvtColor(..., cv2.COLOR_RGB2BGR)``), ``"width"``
        and ``"height"`` (pixel size of the rendered pixmap).
    """
    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page_no in range(doc.page_count):
            scale = dpi / 72
            pixmap = doc[page_no].get_pixmap(
                matrix=fitz.Matrix(scale, scale), alpha=False
            )
            # NOTE: an earlier guard that fell back to a 1:1 matrix when the
            # pixmap exceeded 2000 px in width or height is disabled, so
            # every page is rendered at the requested scale.
            page_size = [pixmap.width, pixmap.height]
            rgb_image = Image.frombytes("RGB", page_size, pixmap.samples)
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
            rendered_pages.append(
                {"img": bgr_array, "width": pixmap.width, "height": pixmap.height}
            )
    return rendered_pages
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run a layout model over every page of a PDF and collect the results.

    Args:
        pdf_bytes: Raw PDF content.
        ocr: Forwarded to the model constructor.
        show_log: Forwarded to the model constructor.
        model: Identifier of the backend to use. Only ``MODEL.Paddle`` is
            wired up in this function.

    Returns:
        A list with one dict per page. Each dict has ``"layout_dets"`` (the
        value the model returned for that page image) and ``"page_info"``
        (``page_no`` counted from 0, plus ``height`` and ``width`` of the
        rendered page image).

    Raises:
        ValueError: If ``model`` is not a supported backend.
    """
    # Fail fast with a clear message. Previously an unknown model left the
    # callable as None and the first page raised "'NoneType' object is not
    # callable".
    if model != MODEL.Paddle:
        raise ValueError(f"Unsupported model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
magic_pdf/model/model_list.py
0 → 100644
View file @
389826c5
class MODEL:
    """Namespace of string identifiers for selectable analysis backends."""

    # Identifier string for the Paddle backend.
    Paddle = "pp_structure_v2"
magic_pdf/model/
doc_analyze_by_
pp_structurev2.py
→
magic_pdf/model/pp_structure
_
v2.py
View file @
389826c5
import
random
import
fitz
import
cv2
from
paddleocr
import
PPStructure
from
PIL
import
Image
from
loguru
import
logger
import
numpy
as
np
from
paddleocr
import
PPStructure
def
region_to_bbox
(
region
):
...
...
@@ -16,41 +12,8 @@ def region_to_bbox(region):
return
[
x0
,
y0
,
x1
,
y1
]
def dict_compare(d1, d2):
    """Tell whether two dicts contain the same key/value pairs.

    Args:
        d1: First mapping; must provide ``items()``.
        d2: Second mapping; must provide ``items()``.

    Returns:
        The outcome of an equality comparison between both ``items()`` views.
    """
    items_a, items_b = d1.items(), d2.items()
    return items_a == items_b
def remove_duplicates_dicts(lst):
    """Return the dicts of ``lst`` with later duplicates removed.

    Equality is decided by ``dict_compare``; the first occurrence of each
    distinct dict is kept and the input order is preserved.

    Args:
        lst: Iterable of dicts.

    Returns:
        A new list containing only the first occurrence of each dict.
    """
    survivors = []
    for entry in lst:
        is_new = all(not dict_compare(entry, seen) for seen in survivors)
        if is_new:
            survivors.append(entry)
    return survivors
def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
    """Render every page of an in-memory PDF into a BGR image array.

    Args:
        pdf_bytes: Raw PDF content, opened with ``fitz.open("pdf", ...)``.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72`` on
            both axes. The argument is now honoured instead of being reset
            to 200 for every page.

    Returns:
        One dict per page, in page order, with keys ``"img"`` (the array
        produced by ``cv2.cvtColor(..., cv2.COLOR_RGB2BGR)``), ``"width"``
        and ``"height"`` (pixel size of the rendered pixmap).
    """
    imgs = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(doc.page_count):
            page = doc[index]
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pm = page.get_pixmap(matrix=mat, alpha=False)
            # NOTE: a guard that avoided enlarging pages already wider or
            # taller than 2000 px is disabled; all pages use the same scale.
            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            imgs.append({"img": img, "width": pm.width, "height": pm.height})
    # Hand the rendered pages back to the caller, which iterates over them.
    return imgs
class
CustomPaddleModel
:
def
__init__
_
(
self
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
def
__init__
(
self
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
self
.
model
=
PPStructure
(
table
=
False
,
ocr
=
ocr
,
show_log
=
show_log
)
def
__call__
(
self
,
img
):
...
...
@@ -109,23 +72,4 @@ class CustomPaddleModel:
if
len
(
spans
)
>
0
:
result
.
extend
(
spans
)
result
=
remove_duplicates_dicts
(
result
)
return
result
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    """Run the Paddle layout model over every page of a PDF.

    Args:
        pdf_bytes: Raw PDF content.
        ocr: Forwarded to ``CustomPaddleModel``; previously accepted but
            never passed on.
        show_log: Forwarded to ``CustomPaddleModel``; previously accepted
            but never passed on.

    Returns:
        A list with one dict per page. Each dict has ``"layout_dets"`` (the
        value the model returned for that page image) and ``"page_info"``
        (``page_no`` counted from 0, plus ``height`` and ``width`` of the
        rendered page image).
    """
    imgs = load_imags_from_pdf(pdf_bytes)
    custom_paddle = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(imgs):
        result = custom_paddle(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
magic_pdf/pipe/OCRPipe.py
View file @
389826c5
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.user_api
import
parse_ocr_pdf
...
...
magic_pdf/pipe/TXTPipe.py
View file @
389826c5
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
...
...
magic_pdf/pipe/UNIPipe.py
View file @
389826c5
...
...
@@ -3,7 +3,7 @@ import json
from
loguru
import
logger
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
...
...
magic_pdf/user_api.py
View file @
389826c5
...
...
@@ -16,7 +16,7 @@ import re
from
loguru
import
logger
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt_v2
import
parse_pdf_by_txt
...
...
@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return
garbage_count
/
total
def calculate_not_printable_rate(text):
    """Return the fraction of characters in ``text`` that are not printable.

    A character counts as printable when ``str.isprintable()`` is true for it.

    Args:
        text: String to inspect.

    Returns:
        ``0`` for an empty string, otherwise
        ``(len(text) - printable_count) / len(text)`` as a float.
    """
    total = len(text)
    if total == 0:
        return 0  # avoid a division-by-zero error
    # Count once; no intermediate string is built and there is a single
    # return path.
    printable_total = sum(1 for c in text if c.isprintable())
    return (total - printable_total) / total
not_common_character_rate
=
calculate_not_common_character_rate
(
text_all
)
not_printable_rate
=
calculate_not_printable_rate
(
text_all
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment