Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d1c9c7dd
Unverified
Commit
d1c9c7dd
authored
Oct 10, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 10, 2024
Browse files
Merge branch 'opendatalab:dev' into dev
parents
7f9d80fc
ea7bc620
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
145 additions
and
20 deletions
+145
-20
magic_pdf/model/doc_analyze_by_custom_model.py
magic_pdf/model/doc_analyze_by_custom_model.py
+33
-17
magic_pdf/para/para_split_v3.py
magic_pdf/para/para_split_v3.py
+110
-0
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+2
-3
No files found.
magic_pdf/model/doc_analyze_by_custom_model.py
View file @
d1c9c7dd
...
...
@@ -4,6 +4,7 @@ import fitz
import
numpy
as
np
from
loguru
import
logger
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.libs.config_reader
import
get_local_models_dir
,
get_device
,
get_table_recog_config
from
magic_pdf.model.model_list
import
MODEL
import
magic_pdf.model
as
model_config
...
...
@@ -23,7 +24,7 @@ def remove_duplicates_dicts(lst):
return
unique_dicts
def
load_images_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
)
->
list
:
def
load_images_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
,
start_page_id
=
0
,
end_page_id
=
None
)
->
list
:
try
:
from
PIL
import
Image
except
ImportError
:
...
...
@@ -32,18 +33,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
images
=
[]
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
pdf_page_num
=
doc
.
page_count
end_page_id
=
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
pdf_page_num
-
1
if
end_page_id
>
pdf_page_num
-
1
:
logger
.
warning
(
"end_page_id is out of range, use images length"
)
end_page_id
=
pdf_page_num
-
1
for
index
in
range
(
0
,
doc
.
page_count
):
page
=
doc
[
index
]
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
if
start_page_id
<=
index
<=
end_page_id
:
page
=
doc
[
index
]
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
# If the width or height exceeds 9000 after scaling, do not scale further.
if
pm
.
width
>
9000
or
pm
.
height
>
9000
:
pm
=
page
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
# If the width or height exceeds 9000 after scaling, do not scale further.
if
pm
.
width
>
9000
or
pm
.
height
>
9000
:
pm
=
page
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
img
=
Image
.
frombytes
(
"RGB"
,
(
pm
.
width
,
pm
.
height
),
pm
.
samples
)
img
=
np
.
array
(
img
)
img_dict
=
{
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
else
:
img_dict
=
{
"img"
:
[],
"width"
:
0
,
"height"
:
0
}
img
=
Image
.
frombytes
(
"RGB"
,
(
pm
.
width
,
pm
.
height
),
pm
.
samples
)
img
=
np
.
array
(
img
)
img_dict
=
{
"img"
:
img
,
"width"
:
pm
.
width
,
"height"
:
pm
.
height
}
images
.
append
(
img_dict
)
return
images
...
...
@@ -111,14 +122,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
model_manager
=
ModelSingleton
()
custom_model
=
model_manager
.
get_model
(
ocr
,
show_log
,
lang
)
images
=
load_images_from_pdf
(
pdf_bytes
)
# end_page_id = end_page_id if end_page_id else len(images) - 1
end_page_id
=
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
len
(
images
)
-
1
with
fitz
.
open
(
"pdf"
,
pdf_bytes
)
as
doc
:
pdf_page_num
=
doc
.
page_count
end_page_id
=
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
pdf_page_num
-
1
if
end_page_id
>
pdf_page_num
-
1
:
logger
.
warning
(
"end_page_id is out of range, use images length"
)
end_page_id
=
pdf_page_num
-
1
if
end_page_id
>
len
(
images
)
-
1
:
logger
.
warning
(
"end_page_id is out of range, use images length"
)
end_page_id
=
len
(
images
)
-
1
images
=
load_images_from_pdf
(
pdf_bytes
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
)
model_json
=
[]
doc_analyze_start
=
time
.
time
()
...
...
@@ -135,6 +146,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
page_dict
=
{
"layout_dets"
:
result
,
"page_info"
:
page_info
}
model_json
.
append
(
page_dict
)
gc_start
=
time
.
time
()
clean_memory
()
gc_time
=
round
(
time
.
time
()
-
gc_start
,
2
)
logger
.
info
(
f
"gc time:
{
gc_time
}
"
)
doc_analyze_time
=
round
(
time
.
time
()
-
doc_analyze_start
,
2
)
doc_analyze_speed
=
round
(
(
end_page_id
+
1
-
start_page_id
)
/
doc_analyze_time
,
2
)
logger
.
info
(
f
"doc analyze time:
{
round
(
time
.
time
()
-
doc_analyze_start
,
2
)
}
,"
...
...
magic_pdf/para/para_split_v3.py
0 → 100644
View file @
d1c9c7dd
import
copy
from
magic_pdf.libs.Constants
import
LINES_DELETED
,
CROSS_PAGE
# Sentence-terminating characters, ASCII plus full-width CJK variants.
# __merge_2_blocks treats a line whose last span ends with one of these as a
# finished paragraph and does not merge it with the following block
# (tuple form so it can be passed directly to str.endswith).
LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?')
def __process_blocks(blocks):
    """Partition page blocks into groups of consecutive text blocks.

    A group is closed whenever the block that FOLLOWS it is a 'title' or
    'interline_equation'.  Every text block is additionally given a
    'bbox_fs' key: a deep copy of its 'bbox', tightened to the union of
    its line bboxes when it has lines.  Non-text blocks are not grouped.

    Mutates the text blocks in place and returns a list of groups.
    """
    groups = []
    pending = []
    last_idx = len(blocks) - 1

    for idx, blk in enumerate(blocks):
        if blk['type'] == 'text':
            blk["bbox_fs"] = copy.deepcopy(blk["bbox"])
            lines = blk["lines"]
            if len(lines) > 0:
                # Tighten the block bbox to the extent of its lines.
                x0 = min(ln['bbox'][0] for ln in lines)
                y0 = min(ln['bbox'][1] for ln in lines)
                x1 = max(ln['bbox'][2] for ln in lines)
                y1 = max(ln['bbox'][3] for ln in lines)
                blk['bbox_fs'] = [x0, y0, x1, y1]
            pending.append(blk)

        # Close the current group when the next block is a section break.
        if idx < last_idx and blocks[idx + 1]['type'] in ('title', 'interline_equation'):
            groups.append(pending)
            pending = []

    # Flush the trailing group, if any blocks remain.
    if pending:
        groups.append(pending)

    return groups
def __merge_2_blocks(block1, block2):
    """Merge the lines of block1 into block2 when they look like one paragraph.

    block2 is the block that PRECEDES block1 in reading order.  The merge
    fires only when block1's first line starts flush with its block's left
    edge (no paragraph indent) AND block2's last line reaches its block's
    right edge without ending in a sentence terminator (LINE_STOP_FLAG).
    On merge, block1's lines are appended to block2, block1's lines are
    cleared and it is marked with LINES_DELETED; spans crossing a page
    boundary are marked with CROSS_PAGE.

    Both blocks are mutated in place; returns (block1, block2).
    """
    # Guard both blocks: the original indexed block2['lines'][-1] without
    # checking, raising IndexError when the preceding block has no lines.
    if len(block1['lines']) > 0 and len(block2['lines']) > 0:
        first_line = block1['lines'][0]
        line_height = first_line['bbox'][3] - first_line['bbox'][1]
        # No indent on block1's first line -> may be a continuation.
        if abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height / 2:
            last_line = block2['lines'][-1]
            if len(last_line['spans']) > 0:
                last_span = last_line['spans'][-1]
                line_height = last_line['bbox'][3] - last_line['bbox'][1]
                # block2's last line is full-width and the sentence is unfinished.
                if (abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
                        and not last_span['content'].endswith(LINE_STOP_FLAG)):
                    if block1['page_num'] != block2['page_num']:
                        # Paragraph continues across a page break.
                        for line in block1['lines']:
                            for span in line['spans']:
                                span[CROSS_PAGE] = True
                    block2['lines'].extend(block1['lines'])
                    block1['lines'] = []
                    block1[LINES_DELETED] = True

    return block1, block2
def __para_merge_page(blocks):
    """Merge paragraph fragments inside every text-block group of `blocks`."""
    for group in __process_blocks(blocks):
        if len(group) <= 1:
            continue
        # Walk back-to-front so each block can be folded into the one
        # before it; index 0 has no predecessor, so stop at 1.
        for i in range(len(group) - 1, 0, -1):
            __merge_2_blocks(group[i], group[i - 1])
def para_split(pdf_info_dict, debug_mode=False):
    """Build 'para_blocks' for every page from its 'preproc_blocks'.

    Deep-copies each page's preproc blocks, tags them with their page key,
    merges paragraph fragments across the whole document (including page
    boundaries) via __para_merge_page, then routes the blocks back to
    their pages as page['para_blocks'].  Mutates pdf_info_dict in place.

    `debug_mode` is accepted for interface compatibility; it is unused here.
    """
    all_blocks = []
    for page_num, page in pdf_info_dict.items():
        blocks = copy.deepcopy(page['preproc_blocks'])
        for block in blocks:
            # Tag so each block can be routed back to its page afterwards.
            block['page_num'] = page_num
        all_blocks.extend(blocks)

    __para_merge_page(all_blocks)

    # Single-pass routing keyed by the tag (the original rescanned
    # all_blocks once per page: O(pages * blocks)); per-page order of
    # blocks is preserved because all_blocks is in page order.
    for page in pdf_info_dict.values():
        page['para_blocks'] = []
    for block in all_blocks:
        pdf_info_dict[block['page_num']]['para_blocks'].append(block)
if __name__ == '__main__':
    # Smoke test for __process_blocks.
    # Fix: the original demo dicts had only 'type'/'content', but
    # __process_blocks reads 'bbox' and 'lines' from every text block,
    # so the demo crashed with KeyError before printing anything.
    def _demo_text(content):
        """Minimal text block accepted by __process_blocks."""
        return {'type': 'text', 'content': content,
                'bbox': [0, 0, 100, 20], 'lines': []}

    input_blocks = [
        _demo_text('这是第一段'),
        _demo_text('这是第二段'),
        {'type': 'title', 'content': '这是一个标题'},
        _demo_text('这是第三段'),
        {'type': 'interline_equation', 'content': '这是一个公式'},
        _demo_text('这是第四段'),
        {'type': 'image', 'content': '这是一张图片'},
        _demo_text('这是第五段'),
        {'type': 'table', 'content': '这是一张表格'},
    ]

    # Print each resulting group of consecutive text blocks.
    for group_index, group in enumerate(__process_blocks(input_blocks)):
        print(f"Group {group_index}: {group}")
magic_pdf/pdf_parse_union_core_v2.py
View file @
d1c9c7dd
...
...
@@ -17,6 +17,7 @@ from magic_pdf.libs.hash_utils import compute_md5
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.para.para_split_v3
import
para_split
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.construct_page_dict
import
ocr_construct_page_component_v2
from
magic_pdf.pre_proc.cut_image
import
ocr_cut_image_and_table
...
...
@@ -435,9 +436,7 @@ def pdf_parse_union(pdf_bytes,
pdf_info_dict
[
f
"page_
{
page_id
}
"
]
=
page_info
"""分段"""
# para_split(pdf_info_dict, debug_mode=debug_mode)
for
page_num
,
page
in
pdf_info_dict
.
items
():
page
[
'para_blocks'
]
=
page
[
'preproc_blocks'
]
para_split
(
pdf_info_dict
,
debug_mode
=
debug_mode
)
"""dict转list"""
pdf_info_list
=
dict_to_list
(
pdf_info_dict
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment