Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
389826c5
"vscode:/vscode.git/clone" did not exist on "fedd03869a0d3dead88d48acb777b7a8bca3035e"
Commit
389826c5
authored
Jun 18, 2024
by
赵小蒙
Browse files
update custom model framework
parent
c96aa88d
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
75 additions
and
64 deletions
+75
-64
magic_pdf/model/360_layout_analysis.py
magic_pdf/model/360_layout_analysis.py
+0
-0
magic_pdf/model/doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+61
-0
magic_pdf/model/model_list.py
magic_pdf/model/model_list.py
+2
-0
magic_pdf/model/pp_structure_v2.py
magic_pdf/model/pp_structure_v2.py
+2
-58
magic_pdf/pipe/OCRPipe.py
magic_pdf/pipe/OCRPipe.py
+1
-1
magic_pdf/pipe/TXTPipe.py
magic_pdf/pipe/TXTPipe.py
+1
-1
magic_pdf/pipe/UNIPipe.py
magic_pdf/pipe/UNIPipe.py
+1
-1
magic_pdf/user_api.py
magic_pdf/user_api.py
+7
-3
No files found.
magic_pdf/model/
doc_analyze_by_
360layout.py
→
magic_pdf/model/360
_
layout
_analysis
.py
View file @
389826c5
File moved
magic_pdf/model/doc_analyze_by_custom_model.py
0 → 100644
View file @
389826c5
import
fitz
import
cv2
from
PIL
import
Image
import
numpy
as
np
from
magic_pdf.model.model_list
import
MODEL
from
magic_pdf.model.pp_structure_v2
import
CustomPaddleModel
def dict_compare(d1, d2):
    """Return True when both mappings hold exactly the same key/value pairs.

    The comparison is made on the ``items()`` views of the two arguments, so
    the order in which keys were inserted does not matter.
    """
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items
def remove_duplicates_dicts(lst):
    """Return the dicts of *lst* with repeated entries dropped.

    The first occurrence of each dict is kept and the input order is
    preserved.  Equality is decided by ``dict_compare``; every candidate is
    checked pairwise against the dicts already kept.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # An equal dict was already collected; drop this one.
                break
        else:
            # No match among the kept dicts: this is a new entry.
            kept.append(candidate)
    return kept
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of an in-memory PDF into an OpenCV-style image.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72`` on
            both axes.

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        rendered page converted from RGB to BGR channel order), ``"width"``
        and ``"height"`` (pixel size of the rendered pixmap).
    """
    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page_index in range(0, doc.page_count):
            page = doc[page_index]

            # Scale factor relative to 72, applied equally to x and y.
            scale = dpi / 72
            zoom = fitz.Matrix(scale, scale)
            pixmap = page.get_pixmap(matrix=zoom, alpha=False)
            # NOTE: very large pages (> 2000 px on a side) are NOT rendered at
            # a reduced scale; an earlier 1:1 fallback for them is disabled.

            # Pixmap samples -> PIL RGB image -> numpy array in BGR order.
            size = [pixmap.width, pixmap.height]
            rgb_image = Image.frombytes("RGB", size, pixmap.samples)
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)

            rendered_pages.append(
                {
                    "img": bgr_array,
                    "width": pixmap.width,
                    "height": pixmap.height,
                }
            )
    return rendered_pages
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the selected layout model over every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to the model constructor as ``ocr``.
        show_log: Forwarded to the model constructor as ``show_log``.
        model: Which backend to use; only ``MODEL.Paddle`` is supported.

    Returns:
        A list with one dict per page, in page order.  Each dict has
        ``"layout_dets"`` (whatever the model returned for that page image)
        and ``"page_info"`` (``"page_no"``, ``"height"``, ``"width"``, where
        the sizes are those reported by ``load_images_from_pdf``).

    Raises:
        ValueError: If ``model`` is not a supported backend.
    """
    # Reject unknown backends up front.  Previously an unsupported value left
    # the model as None and only failed later with an opaque
    # "'NoneType' object is not callable" on the first page.
    if model != MODEL.Paddle:
        raise ValueError("unsupported model: {!r}".format(model))

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
magic_pdf/model/model_list.py
0 → 100644
View file @
389826c5
class MODEL:
    """Identifiers for the layout-analysis backends a caller can select."""

    # Backend implemented on top of PaddleOCR's PPStructure.
    Paddle = "pp_structure_v2"
magic_pdf/model/
doc_analyze_by_
pp_structurev2.py
→
magic_pdf/model/pp_structure
_
v2.py
View file @
389826c5
import
random
import
random
import
fitz
import
cv2
from
paddleocr
import
PPStructure
from
PIL
import
Image
from
loguru
import
logger
from
loguru
import
logger
import
numpy
as
np
from
paddleocr
import
PPStructure
def
region_to_bbox
(
region
):
def
region_to_bbox
(
region
):
...
@@ -16,41 +12,8 @@ def region_to_bbox(region):
...
@@ -16,41 +12,8 @@ def region_to_bbox(region):
return
[
x0
,
y0
,
x1
,
y1
]
return
[
x0
,
y0
,
x1
,
y1
]
def
dict_compare
(
d1
,
d2
):
return
d1
.
items
()
==
d2
.
items
()
def
remove_duplicates_dicts
(
lst
):
unique_dicts
=
[]
for
dict_item
in
lst
:
if
not
any
(
dict_compare
(
dict_item
,
existing_dict
)
for
existing_dict
in
unique_dicts
):
unique_dicts
.
append
(
dict_item
)
return
unique_dicts
def
load_imags_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
):
imgs
=
[]
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
for
index
in
range
(
0
,
doc
.
page_count
):
page
=
doc
[
index
]
dpi
=
200
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
# if width or height > 2000 pixels, don't enlarge the image
# if pm.width > 2000 or pm.height > 2000:
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img
=
Image
.
frombytes
(
"RGB"
,
[
pm
.
width
,
pm
.
height
],
pm
.
samples
)
img
=
cv2
.
cvtColor
(
np
.
array
(
img
),
cv2
.
COLOR_RGB2BGR
)
img_dict
=
{
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
imgs
.
append
(
img_dict
)
class
CustomPaddleModel
:
class
CustomPaddleModel
:
def
__init__
_
(
self
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
def
__init__
(
self
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
self
.
model
=
PPStructure
(
table
=
False
,
ocr
=
ocr
,
show_log
=
show_log
)
self
.
model
=
PPStructure
(
table
=
False
,
ocr
=
ocr
,
show_log
=
show_log
)
def
__call__
(
self
,
img
):
def
__call__
(
self
,
img
):
...
@@ -109,23 +72,4 @@ class CustomPaddleModel:
...
@@ -109,23 +72,4 @@ class CustomPaddleModel:
if
len
(
spans
)
>
0
:
if
len
(
spans
)
>
0
:
result
.
extend
(
spans
)
result
.
extend
(
spans
)
result
=
remove_duplicates_dicts
(
result
)
return
result
return
result
def
doc_analyze
(
pdf_bytes
:
bytes
,
ocr
:
bool
=
False
,
show_log
:
bool
=
False
):
imgs
=
load_imags_from_pdf
(
pdf_bytes
)
custom_paddle
=
CustomPaddleModel
()
model_json
=
[]
for
index
,
img_dict
in
enumerate
(
imgs
):
img
=
img_dict
[
"img"
]
page_width
=
img_dict
[
"width"
]
page_height
=
img_dict
[
"height"
]
result
=
custom_paddle
(
img
)
page_info
=
{
"page_no"
:
index
,
"height"
:
page_height
,
"width"
:
page_width
}
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
model_json
.
append
(
page_dict
)
return
model_json
magic_pdf/pipe/OCRPipe.py
View file @
389826c5
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.user_api
import
parse_ocr_pdf
from
magic_pdf.user_api
import
parse_ocr_pdf
...
...
magic_pdf/pipe/TXTPipe.py
View file @
389826c5
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.libs.json_compressor
import
JsonCompressor
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
from
magic_pdf.pipe.AbsPipe
import
AbsPipe
...
...
magic_pdf/pipe/UNIPipe.py
View file @
389826c5
...
@@ -3,7 +3,7 @@ import json
...
@@ -3,7 +3,7 @@ import json
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.libs.MakeContentConfig
import
DropMode
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.libs.commons
import
join_path
from
magic_pdf.libs.commons
import
join_path
...
...
magic_pdf/user_api.py
View file @
389826c5
...
@@ -16,7 +16,7 @@ import re
...
@@ -16,7 +16,7 @@ import re
from
loguru
import
logger
from
loguru
import
logger
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.model.doc_analyze_by_
pp_structurev2
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_
custom_model
import
doc_analyze
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.rw
import
AbsReaderWriter
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_ocr_v2
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt_v2
import
parse_pdf_by_txt
from
magic_pdf.pdf_parse_by_txt_v2
import
parse_pdf_by_txt
...
@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
...
@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return
garbage_count
/
total
return
garbage_count
/
total
def
calculate_not_printable_rate
(
text
):
def
calculate_not_printable_rate
(
text
):
printable
=
sum
(
1
for
c
in
text
if
c
.
isprintable
())
printable_text
=
""
for
c
in
text
:
if
c
.
isprintable
():
printable_text
+=
c
printable_total
=
len
(
printable_text
)
total
=
len
(
text
)
total
=
len
(
text
)
if
total
==
0
:
if
total
==
0
:
return
0
# 避免除以零的错误
return
0
# 避免除以零的错误
return
(
total
-
printable
)
/
total
return
(
total
-
printable
_total
)
/
total
not_common_character_rate
=
calculate_not_common_character_rate
(
text_all
)
not_common_character_rate
=
calculate_not_common_character_rate
(
text_all
)
not_printable_rate
=
calculate_not_printable_rate
(
text_all
)
not_printable_rate
=
calculate_not_printable_rate
(
text_all
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment