wangsen / MinerU · Commits · 7d2dfc80

Commit 7d2dfc80, authored Oct 28, 2024 by liukaiwen
Merge branch 'dev' into dev-table-model-update
Parents: a0eff3be, 6d571e2e

Changes: 137 files in total; showing 20 changed files with 518 additions and 232 deletions (+518 / -232).
magic_pdf/pdf_parse_by_ocr.py                           +5    -2
magic_pdf/pdf_parse_by_txt.py                           +5    -2
magic_pdf/pdf_parse_union_core_v2.py                    +300  -156
magic_pdf/pipe/AbsPipe.py                               +6    -7
magic_pdf/pipe/OCRPipe.py                               +8    -4
magic_pdf/pipe/TXTPipe.py                               +8    -4
magic_pdf/pipe/UNIPipe.py                               +10   -5
magic_pdf/pre_proc/ocr_detect_all_bboxes.py             +55   -26
magic_pdf/pre_proc/ocr_dict_merge.py                    +27   -1
magic_pdf/resources/model_config/model_configs.yaml     +5    -13
magic_pdf/tools/cli.py                                  +1    -1
magic_pdf/tools/common.py                               +11   -6
magic_pdf/user_api.py                                   +13   -5
magic_pdf/utils/__init__.py                             +0    -0
magic_pdf/utils/annotations.py                          +11   -0
next_docs/en/.readthedocs.yaml                          +0    -0
next_docs/en/Makefile                                   +0    -0
next_docs/en/_static/image/logo.png                     +0    -0
next_docs/en/api.rst                                    +9    -0
next_docs/en/api/data_reader_writer.rst                 +44   -0
magic_pdf/pdf_parse_by_ocr.py

+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
...
@@ -8,10 +10,11 @@ def parse_pdf_by_ocr(pdf_bytes,
                      end_page_id=None,
                      debug_mode=False,
                      ):
-    return pdf_parse_union(pdf_bytes,
+    dataset = PymuDocDataset(pdf_bytes)
+    return pdf_parse_union(dataset,
                            model_list,
                            imageWriter,
                            "ocr",
+                           SupportedPdfParseMethod.OCR,
                            start_page_id=start_page_id,
                            end_page_id=end_page_id,
                            debug_mode=debug_mode,
...
...
magic_pdf/pdf_parse_by_txt.py

+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
...
@@ -9,10 +11,11 @@ def parse_pdf_by_txt(
                      end_page_id=None,
                      debug_mode=False,
                      ):
-    return pdf_parse_union(pdf_bytes,
+    dataset = PymuDocDataset(pdf_bytes)
+    return pdf_parse_union(dataset,
                            model_list,
                            imageWriter,
                            "txt",
+                           SupportedPdfParseMethod.TXT,
                            start_page_id=start_page_id,
                            end_page_id=end_page_id,
                            debug_mode=debug_mode,
...
...
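Both wrappers now construct a PymuDocDataset from the raw bytes before delegating to pdf_parse_union, and pass an explicit SupportedPdfParseMethod value. A hedged sketch of calling one of them; the file name and the writer argument are placeholders, and the parameter order is assumed from the hunk header above rather than verified against the full signature:

# Hedged sketch: calling the updated OCR wrapper. 'demo.pdf' and the writer are
# placeholders; parameter order is inferred from the hunk header, not verified.
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr

with open('demo.pdf', 'rb') as f:
    pdf_bytes = f.read()

pdf_info = parse_pdf_by_ocr(
    pdf_bytes,
    [],                 # model_list: pre-computed model output (empty for illustration)
    None,               # imageWriter: substitute a real AbsReaderWriter implementation
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
)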
magic_pdf/pdf_parse_union_core_v2.py

 import copy
 import os
 import statistics
 import time
-from loguru import logger
 from typing import List

 import torch
+from loguru import logger

+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
...
...
@@ -15,31 +16,39 @@ from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.local_math import float_equal
-from magic_pdf.libs.ocr_content_type import ContentType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 from magic_pdf.model.magic_model import MagicModel
 from magic_pdf.para.para_split_v3 import para_split
 from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
-from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
+from magic_pdf.pre_proc.construct_page_dict import \
+    ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
-from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \
-    combine_chars_to_pymudict
-from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
-from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans, fix_discarded_block
-from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \
-    remove_overlaps_low_confidence_spans
-from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap
+from magic_pdf.pre_proc.equations_replace import (
+    combine_chars_to_pymudict, remove_chars_in_text_blocks,
+    replace_equations_in_textblock)
+from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
+    ocr_prepare_bboxes_for_layout_split_v2
+from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
+                                               fix_block_spans,
+                                               fix_discarded_block,
+                                               fix_block_spans_v2)
+from magic_pdf.pre_proc.ocr_span_list_modify import (
+    get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
+    remove_overlaps_min_spans)
+from magic_pdf.pre_proc.resolve_bbox_conflict import \
+    check_useful_block_horizontal_overlap
 def remove_horizontal_overlap_block_which_smaller(all_bboxes):
     useful_blocks = []
     for bbox in all_bboxes:
-        useful_blocks.append({
-            "bbox": bbox[:4]
-        })
-    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks)
+        useful_blocks.append({'bbox': bbox[:4]})
+    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
+        check_useful_block_horizontal_overlap(useful_blocks))
     if is_useful_block_horz_overlap:
         logger.warning(
-            f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}")
+            f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}')  # noqa: E501
         for bbox in all_bboxes.copy():
             if smaller_bbox == bbox[:4]:
                 all_bboxes.remove(bbox)
...
...
@@ -47,27 +56,27 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
     return is_useful_block_horz_overlap, all_bboxes


 def __replace_STX_ETX(text_str: str):
-    """ Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
-        Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
+    """Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
+    Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.

     Args:
         text_str (str): raw text

     Returns:
         _type_: replaced text
-    """
+    """  # noqa: E501
     if text_str:
         s = text_str.replace('\u0002', "'")
-        s = s.replace("\u0003", "'")
+        s = s.replace('\u0003', "'")
         return s
     return text_str
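The helper above swaps the STX/ETX control characters (\u0002, \u0003) that pymupdf sometimes emits in place of quotation marks. A small standalone re-implementation to illustrate the behaviour (the function name here is local to the example, not part of the library):

# Standalone illustration of the STX/ETX cleanup applied to extracted text.
def replace_stx_etx(text_str: str) -> str:
    """Replace \u0002 and \u0003 with apostrophes, mirroring the diff above."""
    if text_str:
        s = text_str.replace('\u0002', "'")
        s = s.replace('\u0003', "'")
        return s
    return text_str

print(replace_stx_etx('don\u0002t'))   # -> don't
print(replace_stx_etx(''))             # -> '' (falsy input is returned unchanged)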
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
-    text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
-    char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
+    text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+    char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
     text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
     text_blocks = replace_equations_in_textblock(
...
...
@@ -77,54 +86,63 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_blocks = remove_chars_in_text_blocks(text_blocks)
     spans = []
     for v in text_blocks:
-        for line in v["lines"]:
-            for span in line["spans"]:
-                bbox = span["bbox"]
+        for line in v['lines']:
+            for span in line['spans']:
+                bbox = span['bbox']
                 if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
                     continue
-                if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation):
+                if span.get('type') not in (
+                    ContentType.InlineEquation,
+                    ContentType.InterlineEquation,
+                ):
                     spans.append(
                         {
-                            "bbox": list(span["bbox"]),
-                            "content": __replace_STX_ETX(span["text"]),
-                            "type": ContentType.Text,
-                            "score": 1.0,
+                            'bbox': list(span['bbox']),
+                            'content': __replace_STX_ETX(span['text']),
+                            'type': ContentType.Text,
+                            'score': 1.0,
                         }
                     )
     return spans


 def replace_text_span(pymu_spans, ocr_spans):
-    return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans
+    return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
 def model_init(model_name: str):
     from transformers import LayoutLMv3ForTokenClassification

     if torch.cuda.is_available():
-        device = torch.device("cuda")
+        device = torch.device('cuda')
         if torch.cuda.is_bf16_supported():
             supports_bfloat16 = True
         else:
             supports_bfloat16 = False
     else:
-        device = torch.device("cpu")
+        device = torch.device('cpu')
         supports_bfloat16 = False

-    if model_name == "layoutreader":
+    if model_name == 'layoutreader':
         # check whether the modelscope cache directory exists
         layoutreader_model_dir = get_local_layoutreader_model_dir()
         if os.path.exists(layoutreader_model_dir):
-            model = LayoutLMv3ForTokenClassification.from_pretrained(layoutreader_model_dir)
+            model = LayoutLMv3ForTokenClassification.from_pretrained(
+                layoutreader_model_dir)
         else:
             logger.warning(
-                f"local layoutreader model not exists, use online model from huggingface")
-            model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
+                'local layoutreader model not exists, use online model from huggingface'
+            )
+            model = LayoutLMv3ForTokenClassification.from_pretrained(
+                'hantian/layoutreader')
         # check whether the device supports bfloat16
         if supports_bfloat16:
             model.bfloat16()
         model.to(device).eval()
     else:
-        logger.error("model name not allow")
+        logger.error('model name not allow')
         exit(1)
     return model
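The device and precision selection in model_init follows a common pattern: prefer CUDA when available and enable bfloat16 only if the GPU supports it. A self-contained sketch of just that selection logic (assumes PyTorch is installed; no MinerU code involved):

# Standalone sketch of the device / bfloat16 selection used above.
import torch

def pick_device_and_bf16():
    if torch.cuda.is_available():
        device = torch.device('cuda')
        supports_bfloat16 = torch.cuda.is_bf16_supported()
    else:
        device = torch.device('cpu')
        supports_bfloat16 = False
    return device, supports_bfloat16

device, use_bf16 = pick_device_and_bf16()
print(device, use_bf16)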
...
...
@@ -145,7 +163,9 @@ class ModelSingleton:

 def do_predict(boxes: List[List[int]], model) -> List[int]:
-    from magic_pdf.model.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
+    from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
+                                            prepare_inputs)

     inputs = boxes2inputs(boxes)
     inputs = prepare_inputs(inputs, model)
     logits = model(**inputs).logits.cpu().squeeze(0)
...
...
@@ -154,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:

 def cal_block_index(fix_blocks, sorted_bboxes):
     for block in fix_blocks:
-        # if block['type'] in ['text', 'title', 'interline_equation']:
-        #     line_index_list = []
-        #     if len(block['lines']) == 0:
-        #         block['index'] = sorted_bboxes.index(block['bbox'])
-        #     else:
-        #         for line in block['lines']:
-        #             line['index'] = sorted_bboxes.index(line['bbox'])
-        #             line_index_list.append(line['index'])
-        #         median_value = statistics.median(line_index_list)
-        #         block['index'] = median_value
-        #
-        # elif block['type'] in ['table', 'image']:
-        #     block['index'] = sorted_bboxes.index(block['bbox'])

         line_index_list = []
         if len(block['lines']) == 0:
...
...
@@ -178,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
             median_value = statistics.median(line_index_list)
             block['index'] = median_value

-        # remove the virtual line info from image/table blocks
-        if block['type'] in ['table', 'image']:
-            del block['lines']
+        # remove the virtual line info from image/table body blocks and backfill it with real_lines
+        if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
+            block['virtual_lines'] = copy.deepcopy(block['lines'])
+            block['lines'] = copy.deepcopy(block['real_lines'])
+            del block['real_lines']

     return fix_blocks
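cal_block_index assigns each block an index derived from the reading-order positions of its lines (the median of the line indices). A small standalone illustration of that idea, independent of the MinerU data structures:

# Standalone illustration: a block's index is the median of its lines' positions
# in the globally sorted reading order.
import statistics

def block_index_from_lines(line_bboxes, sorted_bboxes):
    line_indices = [sorted_bboxes.index(bbox) for bbox in line_bboxes]
    return statistics.median(line_indices)

sorted_bboxes = [(0, 0, 10, 5), (0, 6, 10, 11), (0, 12, 10, 17)]
block_lines = [(0, 6, 10, 11), (0, 12, 10, 17)]
print(block_index_from_lines(block_lines, sorted_bboxes))  # 1.5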
...
...
@@ -193,21 +202,22 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
     block_weight = x1 - x0

     # if the block height is less than n body-text lines, return the block's bbox directly
     if line_height * 3 < block_height:
-        if block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25:  # probably a two-column layout, can be split more finely
+        if (
+            block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
+        ):  # probably a two-column layout, can be split more finely
             lines = int(block_height / line_height) + 1
         else:
-            # if the block width exceeds 0.4 of the page width, split the block into 3 lines
+            # if the block width exceeds 0.4 of the page width, split the block into 3 lines
+            # (a complex layout; figures should not be split too finely)
             if block_weight > page_w * 0.4:
                 line_height = (y1 - y0) / 3
                 lines = 3
-            elif block_weight > page_w * 0.25:
-                # otherwise split the block into two lines
-                line_height = (y1 - y0) / 2
-                lines = 2
-            else:
-                # check the aspect ratio
-                if block_height / block_weight > 1.2:
-                    # tall and narrow blocks are not split
+            elif block_weight > page_w * 0.25:
+                # (possibly a three-column layout, also split more finely)
+                lines = int(block_height / line_height) + 1
+            else:
+                # check the aspect ratio
+                if block_height / block_weight > 1.2:
+                    # tall and narrow blocks are not split
                     return [[x0, y0, x1, y1]]
-                else:
-                    # otherwise still split into two lines
+                else:
+                    # otherwise still split into two lines
                     line_height = (y1 - y0) / 2
                     lines = 2
...
...
@@ -229,7 +239,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):

 def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
     page_line_list = []
     for block in fix_blocks:
-        if block['type'] in ['text', 'title', 'interline_equation']:
+        if block['type'] in [
+            BlockType.Text, BlockType.Title, BlockType.InterlineEquation,
+            BlockType.ImageCaption, BlockType.ImageFootnote,
+            BlockType.TableCaption, BlockType.TableFootnote
+        ]:
             if len(block['lines']) == 0:
                 bbox = block['bbox']
                 lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
...
...
@@ -240,8 +254,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
             for line in block['lines']:
                 bbox = line['bbox']
                 page_line_list.append(bbox)
-        elif block['type'] in ['table', 'image']:
+        elif block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
             bbox = block['bbox']
+            block["real_lines"] = copy.deepcopy(block['lines'])
             lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
             block['lines'] = []
             for line in lines:
...
...
@@ -256,19 +271,23 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
     for left, top, right, bottom in page_line_list:
         if left < 0:
             logger.warning(
-                f"left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+                f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
             left = 0
         if right > page_w:
             logger.warning(
-                f"right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+                f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
             right = page_w
         if top < 0:
             logger.warning(
-                f"top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+                f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
             top = 0
         if bottom > page_h:
             logger.warning(
-                f"bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}")
+                f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
             bottom = page_h

         left = round(left * x_scale)
...
@@ -276,11 +295,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
         right = round(right * x_scale)
         bottom = round(bottom * y_scale)
         assert (
             1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
-        ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
+        ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'  # noqa: E126, E121
         boxes.append([left, top, right, bottom])

     model_manager = ModelSingleton()
-    model = model_manager.get_model("layoutreader")
+    model = model_manager.get_model('layoutreader')
     with torch.no_grad():
         orders = do_predict(boxes, model)
     sorted_bboxes = [page_line_list[i] for i in orders]
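Before calling LayoutReader, the code above clamps each line bbox to the page and rescales it into the 0-1000 integer coordinate space the sort model expects (the 1000-unit scale follows the assert in the hunk; the exact x_scale/y_scale computation is elided in this diff, so the definition below is an assumption). A self-contained sketch of that normalisation step, with a made-up page size:

# Standalone sketch: clamp a line bbox to the page, then scale it to 0-1000
# integer coordinates. The scale factors are assumed to be 1000 / page size.
def normalize_box(left, top, right, bottom, page_w, page_h):
    left = min(max(left, 0), page_w)
    right = min(max(right, 0), page_w)
    top = min(max(top, 0), page_h)
    bottom = min(max(bottom, 0), page_h)
    x_scale = 1000.0 / page_w
    y_scale = 1000.0 / page_h
    box = [round(left * x_scale), round(top * y_scale),
           round(right * x_scale), round(bottom * y_scale)]
    assert 1000 >= box[2] >= box[0] >= 0 and 1000 >= box[3] >= box[1] >= 0
    return box

print(normalize_box(-3, 12.5, 300, 48.2, page_w=612, page_h=792))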
...
...
@@ -291,149 +310,274 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):

 def get_line_height(blocks):
     page_line_height_list = []
     for block in blocks:
-        if block['type'] in ['text', 'title', 'interline_equation']:
+        if block['type'] in [
+            BlockType.Text, BlockType.Title,
+            BlockType.ImageCaption, BlockType.ImageFootnote,
+            BlockType.TableCaption, BlockType.TableFootnote
+        ]:
             for line in block['lines']:
                 bbox = line['bbox']
                 page_line_height_list.append(int(bbox[3] - bbox[1]))
     if len(page_line_height_list) > 0:
         return statistics.median(page_line_height_list)
     else:
         return 10
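get_line_height falls back to a default of 10 when no text lines are present; otherwise it takes the median line height across the page. A tiny standalone version of that calculation:

# Standalone sketch of the median line-height estimate with a default fallback.
import statistics

def estimate_line_height(line_bboxes, default=10):
    heights = [int(y1 - y0) for (_, y0, _, y1) in line_bboxes]
    return statistics.median(heights) if heights else default

print(estimate_line_height([(0, 0, 50, 12), (0, 14, 50, 27), (0, 29, 50, 40)]))  # 12
print(estimate_line_height([]))                                                  # 10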
-def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
+def process_groups(groups, body_key, caption_key, footnote_key):
+    body_blocks = []
+    caption_blocks = []
+    footnote_blocks = []
+    for i, group in enumerate(groups):
+        group[body_key]['group_id'] = i
+        body_blocks.append(group[body_key])
+        for caption_block in group[caption_key]:
+            caption_block['group_id'] = i
+            caption_blocks.append(caption_block)
+        for footnote_block in group[footnote_key]:
+            footnote_block['group_id'] = i
+            footnote_blocks.append(footnote_block)
+    return body_blocks, caption_blocks, footnote_blocks
+
+
+def process_block_list(blocks, body_type, block_type):
+    indices = [block['index'] for block in blocks]
+    median_index = statistics.median(indices)
+
+    body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])
+
+    return {
+        'type': block_type,
+        'bbox': body_bbox,
+        'blocks': blocks,
+        'index': median_index,
+    }
+
+
+def revert_group_blocks(blocks):
+    image_groups = {}
+    table_groups = {}
+    new_blocks = []
+    for block in blocks:
+        if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]:
+            group_id = block['group_id']
+            if group_id not in image_groups:
+                image_groups[group_id] = []
+            image_groups[group_id].append(block)
+        elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]:
+            group_id = block['group_id']
+            if group_id not in table_groups:
+                table_groups[group_id] = []
+            table_groups[group_id].append(block)
+        else:
+            new_blocks.append(block)
+
+    for group_id, blocks in image_groups.items():
+        new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image))
+
+    for group_id, blocks in table_groups.items():
+        new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table))
+
+    return new_blocks
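process_groups flattens image/table groups into individual blocks tagged with a group_id, and revert_group_blocks later reassembles them by that id. A minimal standalone demo of the tag-and-regroup idea (the dict layout is simplified relative to the real MagicModel output):

# Standalone demo of the group_id round trip: tag blocks when flattening a group,
# then regroup them by that id later. Field names are simplified for illustration.
from collections import defaultdict

groups = [
    {'body': {'type': 'image_body'}, 'captions': [{'type': 'image_caption'}]},
    {'body': {'type': 'image_body'}, 'captions': []},
]

flat = []
for i, group in enumerate(groups):
    group['body']['group_id'] = i
    flat.append(group['body'])
    for cap in group['captions']:
        cap['group_id'] = i
        flat.append(cap)

regrouped = defaultdict(list)
for block in flat:
    regrouped[block['group_id']].append(block)

print(dict(regrouped))   # blocks reassembled per original group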
+def parse_page_core(
+    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
+):
     need_drop = False
     drop_reason = []

-    '''get the block information needed later from the magic_model object'''
-    img_blocks = magic_model.get_imgs(page_id)
-    table_blocks = magic_model.get_tables(page_id)
+    """get the block information needed later from the magic_model object"""
+    # img_blocks = magic_model.get_imgs(page_id)
+    # table_blocks = magic_model.get_tables(page_id)
+    img_groups = magic_model.get_imgs_v2(page_id)
+    table_groups = magic_model.get_tables_v2(page_id)
+
+    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
+        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
+    )
+
+    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
+        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
+    )
+
     discarded_blocks = magic_model.get_discarded(page_id)
     text_blocks = magic_model.get_text_blocks(page_id)
     title_blocks = magic_model.get_title_blocks(page_id)
-    inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)
+    inline_equations, interline_equations, interline_equation_blocks = (
+        magic_model.get_equations(page_id)
+    )

     page_w, page_h = magic_model.get_page_size(page_id)

     spans = magic_model.get_all_spans(page_id)

-    '''build spans according to parse_mode'''
-    if parse_mode == "txt":
+    """build spans according to parse_mode"""
+    if parse_mode == SupportedPdfParseMethod.TXT:
         """in ocr results, replace the text-type spans with the pymu spans!"""
-        pymu_spans = txt_spans_extract(pdf_docs[page_id], inline_equations, interline_equations)
+        pymu_spans = txt_spans_extract(page_doc, inline_equations, interline_equations)
         spans = replace_text_span(pymu_spans, spans)
-    elif parse_mode == "ocr":
+    elif parse_mode == SupportedPdfParseMethod.OCR:
         pass
     else:
-        raise Exception("parse_mode must be txt or ocr")
+        raise Exception('parse_mode must be txt or ocr')
-    '''drop the lower-confidence spans among overlapping spans'''
+    """drop the lower-confidence spans among overlapping spans"""
     spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    '''drop the smaller spans among overlapping spans'''
+    """drop the smaller spans among overlapping spans"""
     spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)

-    '''crop screenshots of images and tables'''
-    spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
+    """crop screenshots of images and tables"""
+    spans = ocr_cut_image_and_table(
+        spans, page_doc, page_id, pdf_bytes_md5, imageWriter
+    )

-    '''collect the bboxes of all blocks together'''
+    """collect the bboxes of all blocks together"""
     # the interline_equation_blocks parameter is not accurate enough; switch to interline_equations later on
     interline_equation_blocks = []
     if len(interline_equation_blocks) > 0:
         all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
-            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
-            interline_equation_blocks, page_w, page_h)
+            img_body_blocks, img_caption_blocks, img_footnote_blocks,
+            table_body_blocks, table_caption_blocks, table_footnote_blocks,
+            discarded_blocks,
+            text_blocks,
+            title_blocks,
+            interline_equation_blocks,
+            page_w,
+            page_h,
+        )
     else:
         all_bboxes, all_discarded_blocks = ocr_prepare_bboxes_for_layout_split_v2(
-            img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
-            interline_equations, page_w, page_h)
+            img_body_blocks, img_caption_blocks, img_footnote_blocks,
+            table_body_blocks, table_caption_blocks, table_footnote_blocks,
+            discarded_blocks,
+            text_blocks,
+            title_blocks,
+            interline_equations,
+            page_w,
+            page_h,
+        )

-    '''first handle the discarded_blocks that do not need layout'''
+    """first handle the discarded_blocks that do not need layout"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
     fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)

-    '''skip the page if it has no bboxes'''
+    """skip the page if it has no bboxes"""
     if len(all_bboxes) == 0:
-        logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}")
-        return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [], [], [],
-                                               interline_equations, fix_discarded_blocks,
-                                               need_drop, drop_reason)
+        logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
+        return ocr_construct_page_component_v2(
+            [], [], page_id, page_w, page_h, [], [], [],
+            interline_equations, fix_discarded_blocks,
+            need_drop, drop_reason,
+        )

-    '''fill spans into blocks'''
-    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.3)
+    """fill spans into blocks"""
+    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)
-    '''apply the fix operation to blocks'''
-    fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
+    """apply the fix operation to blocks"""
+    fix_blocks = fix_block_spans_v2(block_with_spans)

-    '''collect all lines and compute the body-text line height'''
+    """collect all lines and compute the body-text line height"""
     line_height = get_line_height(fix_blocks)

-    '''collect all lines and sort them'''
+    """collect all lines and sort them"""
     sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)

-    '''compute the block ordering from the median of its line indices'''
+    """compute the block ordering from the median of its line indices"""
     fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)

-    '''re-order blocks'''
+    """restore image and table blocks back to group form for the downstream steps"""
+    fix_blocks = revert_group_blocks(fix_blocks)
+
+    """re-order blocks"""
     sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])

-    '''get the lists that QA needs exposed externally'''
+    """get the lists that QA needs exposed externally"""
     images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)

-    '''build pdf_info_dict'''
-    page_info = ocr_construct_page_component_v2(sorted_blocks, [], page_id, page_w, page_h, [],
-                                                images, tables, interline_equations,
-                                                fix_discarded_blocks, need_drop, drop_reason)
+    """build pdf_info_dict"""
+    page_info = ocr_construct_page_component_v2(
+        sorted_blocks, [], page_id, page_w, page_h, [],
+        images, tables, interline_equations,
+        fix_discarded_blocks, need_drop, drop_reason,
+    )
     return page_info
-def pdf_parse_union(pdf_bytes,
-                    model_list,
-                    imageWriter,
-                    parse_mode,
-                    start_page_id=0,
-                    end_page_id=None,
-                    debug_mode=False,
-                    ):
-    pdf_bytes_md5 = compute_md5(pdf_bytes)
-    pdf_docs = fitz.open("pdf", pdf_bytes)
+def pdf_parse_union(
+    dataset: Dataset,
+    model_list,
+    imageWriter,
+    parse_mode,
+    start_page_id=0,
+    end_page_id=None,
+    debug_mode=False,
+):
+    pdf_bytes_md5 = compute_md5(dataset.data_bits())
-    '''initialize an empty pdf_info_dict'''
+    """initialize an empty pdf_info_dict"""
     pdf_info_dict = {}

-    '''initialize magic_model with model_list and the docs object'''
-    magic_model = MagicModel(model_list, pdf_docs)
+    """initialize magic_model with model_list and the docs object"""
+    magic_model = MagicModel(model_list, dataset)

-    '''parse the pdf according to the given page range'''
+    """parse the pdf according to the given page range"""
     # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
-    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1
+    end_page_id = (
+        end_page_id
+        if end_page_id is not None and end_page_id >= 0
+        else len(dataset) - 1
+    )

-    if end_page_id > len(pdf_docs) - 1:
-        logger.warning("end_page_id is out of range, use pdf_docs length")
-        end_page_id = len(pdf_docs) - 1
+    if end_page_id > len(dataset) - 1:
+        logger.warning('end_page_id is out of range, use pdf_docs length')
+        end_page_id = len(dataset) - 1
-    '''initialize the start time'''
+    """initialize the start time"""
     start_time = time.time()

-    for page_id, page in enumerate(pdf_docs):
-        '''in debug mode, log how long each page takes to parse'''
+    for page_id, page in enumerate(dataset):
+        """in debug mode, log how long each page takes to parse."""
         if debug_mode:
             time_now = time.time()
             logger.info(
-                f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
+                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
             )
             start_time = time_now

-        '''parse each page of the pdf'''
+        """parse each page of the pdf"""
         if start_page_id <= page_id <= end_page_id:
-            page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+            page_info = parse_page_core(
+                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode
+            )
         else:
-            page_w = page.rect.width
-            page_h = page.rect.height
-            page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h,
-                                                        [], [], [], [], [],
-                                                        True, "skip page")
-        pdf_info_dict[f"page_{page_id}"] = page_info
+            page_info = page.get_page_info()
+            page_w = page_info.w
+            page_h = page_info.h
+            page_info = ocr_construct_page_component_v2(
+                [], [], page_id, page_w, page_h,
+                [], [], [], [], [],
+                True, 'skip page'
+            )
+        pdf_info_dict[f'page_{page_id}'] = page_info

     """paragraph splitting"""
     para_split(pdf_info_dict, debug_mode=debug_mode)
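The page-range handling above normalises end_page_id against the dataset length: a negative or missing value means "last page", and an out-of-range value is clamped with a warning. A standalone sketch of that rule:

# Standalone sketch of the end_page_id normalisation used in pdf_parse_union.
def resolve_end_page_id(end_page_id, num_pages):
    if end_page_id is None or end_page_id < 0:
        end_page_id = num_pages - 1          # default to the last page
    if end_page_id > num_pages - 1:
        print('end_page_id is out of range, use document length')
        end_page_id = num_pages - 1          # clamp to the last page
    return end_page_id

print(resolve_end_page_id(None, 10))   # 9
print(resolve_end_page_id(42, 10))     # 9 (clamped, with a warning)
print(resolve_end_page_id(3, 10))      # 3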
...
...
@@ -441,7 +585,7 @@ def pdf_parse_union(pdf_bytes,
     """convert dict to list"""
     pdf_info_list = dict_to_list(pdf_info_dict)
     new_pdf_info_dict = {
-        "pdf_info": pdf_info_list,
+        'pdf_info': pdf_info_list,
     }

     clean_memory()
...
...
magic_pdf/pipe/AbsPipe.py
...
@@ -17,7 +17,7 @@ class AbsPipe(ABC):
     PIP_TXT = "txt"

     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None, lang=None):
+                 start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
...
@@ -26,6 +26,9 @@ class AbsPipe(ABC):
         self.start_page_id = start_page_id
         self.end_page_id = end_page_id
         self.lang = lang
+        self.layout_model = layout_model
+        self.formula_enable = formula_enable
+        self.table_enable = table_enable

     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
...
@@ -95,9 +98,7 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        parse_type = pdf_mid_data["_parse_type"]
-        lang = pdf_mid_data.get("_lang", None)
-        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path, parse_type, lang)
+        content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
         return content_list

     @staticmethod
...
@@ -107,9 +108,7 @@ class AbsPipe(ABC):
         """
         pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
         pdf_info_list = pdf_mid_data["pdf_info"]
-        parse_type = pdf_mid_data["_parse_type"]
-        lang = pdf_mid_data.get("_lang", None)
-        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path, parse_type, lang)
+        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
         return md_content
magic_pdf/pipe/OCRPipe.py
...
@@ -10,8 +10,10 @@ from magic_pdf.user_api import parse_ocr_pdf
 class OCRPipe(AbsPipe):

     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None, lang=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
+                 start_page_id=0, end_page_id=None, lang=None,
+                 layout_model=None, formula_enable=None, table_enable=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
+                         layout_model, formula_enable, table_enable)

     def pipe_classify(self):
         pass
...
@@ -19,12 +21,14 @@ class OCRPipe(AbsPipe):
     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                      lang=self.lang)
+                                      lang=self.lang, layout_model=self.layout_model,
+                                      formula_enable=self.formula_enable, table_enable=self.table_enable)

     def pipe_parse(self):
         self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang)
+                                          lang=self.lang, layout_model=self.layout_model,
+                                          formula_enable=self.formula_enable, table_enable=self.table_enable)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
...
...
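All three pipes now accept layout_model, formula_enable and table_enable and forward them to doc_analyze and the parse_* functions. A hedged usage sketch of the new constructor options; the byte source and writer are placeholders, and the accepted values for layout_model (e.g. a key such as 'doclayout_yolo' from the weights section changed later in this commit) are an assumption here:

# Hedged sketch of constructing OCRPipe with the new optional switches.
# The byte source, writer, and accepted option values are placeholders.
from magic_pdf.pipe.OCRPipe import OCRPipe

pipe = OCRPipe(
    pdf_bytes=b'%PDF-...',        # raw PDF bytes (placeholder)
    model_list=[],                # empty: pipe_analyze() will populate it
    image_writer=None,            # substitute a real AbsReaderWriter
    is_debug=True,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,            # e.g. 'doclayout_yolo' (assumed value)
    formula_enable=None,          # None keeps the library default
    table_enable=None,
)
# Typical order in tools/common.py: pipe.pipe_analyze() then pipe.pipe_parse().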
magic_pdf/pipe/TXTPipe.py
...
@@ -11,8 +11,10 @@ from magic_pdf.user_api import parse_txt_pdf
 class TXTPipe(AbsPipe):

     def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None, lang=None):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang)
+                 start_page_id=0, end_page_id=None, lang=None,
+                 layout_model=None, formula_enable=None, table_enable=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
+                         layout_model, formula_enable, table_enable)

     def pipe_classify(self):
         pass
...
@@ -20,12 +22,14 @@ class TXTPipe(AbsPipe):
     def pipe_analyze(self):
         self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
                                       start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                      lang=self.lang)
+                                      lang=self.lang, layout_model=self.layout_model,
+                                      formula_enable=self.formula_enable, table_enable=self.table_enable)

     def pipe_parse(self):
         self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang)
+                                          lang=self.lang, layout_model=self.layout_model,
+                                          formula_enable=self.formula_enable, table_enable=self.table_enable)

     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
...
...
magic_pdf/pipe/UNIPipe.py
...
@@ -14,9 +14,11 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 class UNIPipe(AbsPipe):

     def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
-                 start_page_id=0, end_page_id=None, lang=None):
+                 start_page_id=0, end_page_id=None, lang=None,
+                 layout_model=None, formula_enable=None, table_enable=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id, lang)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id,
+                         lang, layout_model, formula_enable, table_enable)
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
...
@@ -29,18 +31,21 @@ class UNIPipe(AbsPipe):
         if self.pdf_type == self.PIP_TXT:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang)
+                                          lang=self.lang, layout_model=self.layout_model,
+                                          formula_enable=self.formula_enable, table_enable=self.table_enable)
         elif self.pdf_type == self.PIP_OCR:
             self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
                                           start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                          lang=self.lang)
+                                          lang=self.lang, layout_model=self.layout_model,
+                                          formula_enable=self.formula_enable, table_enable=self.table_enable)

     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                                 is_debug=self.is_debug,
                                                 input_model_is_empty=self.input_model_is_empty,
                                                 start_page_id=self.start_page_id, end_page_id=self.end_page_id,
-                                                lang=self.lang)
+                                                lang=self.lang, layout_model=self.layout_model,
+                                                formula_enable=self.formula_enable, table_enable=self.table_enable)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
                                               is_debug=self.is_debug,
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py

 from loguru import logger

 from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
-    calculate_iou
+    calculate_iou, calculate_vertical_projection_overlap_ratio
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import BlockType
 from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
...
...
@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     return all_bboxes, all_discarded_blocks, drop_reasons


-def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_blocks, text_blocks,
-                                           title_blocks, interline_equation_blocks, page_w, page_h):
-    all_bboxes = []
-    all_discarded_blocks = []
-    for image in img_blocks:
-        x0, y0, x1, y1 = image['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]])
-    for table in table_blocks:
-        x0, y0, x1, y1 = table['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]])
-    for text in text_blocks:
-        x0, y0, x1, y1 = text['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]])
-    for title in title_blocks:
-        x0, y0, x1, y1 = title['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]])
-    for interline_equation in interline_equation_blocks:
-        x0, y0, x1, y1 = interline_equation['bbox']
-        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]])
+def add_bboxes(blocks, block_type, bboxes):
+    for block in blocks:
+        x0, y0, x1, y1 = block['bbox']
+        if block_type in [
+            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
+            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
+        ]:
+            bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"], block["group_id"]])
+        else:
+            bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block["score"]])
+
+
+def ocr_prepare_bboxes_for_layout_split_v2(img_body_blocks, img_caption_blocks, img_footnote_blocks,
+                                           table_body_blocks, table_caption_blocks, table_footnote_blocks,
+                                           discarded_blocks, text_blocks, title_blocks,
+                                           interline_equation_blocks, page_w, page_h):
+    all_bboxes = []
+
+    add_bboxes(img_body_blocks, BlockType.ImageBody, all_bboxes)
+    add_bboxes(img_caption_blocks, BlockType.ImageCaption, all_bboxes)
+    add_bboxes(img_footnote_blocks, BlockType.ImageFootnote, all_bboxes)
+    add_bboxes(table_body_blocks, BlockType.TableBody, all_bboxes)
+    add_bboxes(table_caption_blocks, BlockType.TableCaption, all_bboxes)
+    add_bboxes(table_footnote_blocks, BlockType.TableFootnote, all_bboxes)
+    add_bboxes(text_blocks, BlockType.Text, all_bboxes)
+    add_bboxes(title_blocks, BlockType.Title, all_bboxes)
+    add_bboxes(interline_equation_blocks, BlockType.InterlineEquation, all_bboxes)

     '''resolve block nesting issues'''
     '''when a text box overlaps a title box, trust the text box first'''
...
...
@@ -96,13 +101,23 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
     '''when an interline_equation box is contained inside a text-type box and is much smaller than the text block, trust the text box and drop the equation box'''
     # removed by the later big-box-contains-small-box logic

-    '''in discarded_blocks keep only those wider than 1/3 of the page width, taller than 10, and located in the lower 50% of the page (restricted to footnotes)'''
+    '''discarded_blocks'''
     all_discarded_blocks = []
+    add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
+
+    '''footnote detection: wider than 1/3 of the page width, taller than 10, and located in the lower 50% of the page'''
     footnote_blocks = []
     for discarded in discarded_blocks:
         x0, y0, x1, y1 = discarded['bbox']
-        all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
-
         # add footnotes to all_bboxes for layout computation
         # if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
         #     all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]])
         if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
             footnote_blocks.append([x0, y0, x1, y1])

     '''remove any box that sits below a footnote'''
     need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks)
     if len(need_remove_blocks) > 0:
         for block in need_remove_blocks:
             all_bboxes.remove(block)
             all_discarded_blocks.append(block)

     '''after the processing above, if a big box still contains a small box, remove the small box'''
     all_bboxes = remove_overlaps_min_blocks(all_bboxes)
...
...
@@ -113,6 +128,20 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
     return all_bboxes, all_discarded_blocks


+def find_blocks_under_footnote(all_bboxes, footnote_blocks):
+    need_remove_blocks = []
+    for block in all_bboxes:
+        block_x0, block_y0, block_x1, block_y1 = block[:4]
+        for footnote_bbox in footnote_blocks:
+            footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox
+            # if the footnote's vertical projection covers at least 80% of the block's vertical projection
+            # and the block's y0 is greater than or equal to the footnote's y1
+            if block_y0 >= footnote_y1 and calculate_vertical_projection_overlap_ratio((block_x0, block_y0, block_x1, block_y1), footnote_bbox) >= 0.8:
+                if block not in need_remove_blocks:
+                    need_remove_blocks.append(block)
+                    break
+    return need_remove_blocks
+
+
 def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
     # first extract all text and interline blocks
     text_blocks = []
...
...
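find_blocks_under_footnote drops any block that starts below a footnote and whose horizontal extent is mostly covered by that footnote's vertical projection. A standalone sketch of the same test; calculate_vertical_projection_overlap_ratio is re-implemented here in simplified form for the example, so treat it as an approximation of the library helper:

# Standalone sketch: drop blocks that sit below a footnote and whose horizontal
# span is mostly covered by the footnote (simplified overlap-ratio helper).
def vertical_projection_overlap_ratio(block, footnote):
    bx0, _, bx1, _ = block
    fx0, _, fx1, _ = footnote
    overlap = max(0.0, min(bx1, fx1) - max(bx0, fx0))
    width = bx1 - bx0
    return overlap / width if width > 0 else 0.0

def blocks_under_footnote(blocks, footnotes):
    removed = []
    for block in blocks:
        for fn in footnotes:
            if block[1] >= fn[3] and vertical_projection_overlap_ratio(block, fn) >= 0.8:
                removed.append(block)
                break
    return removed

footnotes = [(50, 700, 550, 720)]
blocks = [(60, 730, 500, 760), (60, 100, 500, 200)]
print(blocks_under_footnote(blocks, footnotes))  # only the block below the footnote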
magic_pdf/pre_proc/ocr_dict_merge.py
...
@@ -49,7 +49,7 @@ def merge_spans_to_line(spans):
             continue

         # if the current span overlaps the last span of the current line on the y-axis, add it to the current line
-        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.6):
+        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], 0.5):
             current_line.append(span)
         else:
             # otherwise, start a new line
...
@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
             'type': block_type,
             'bbox': block_bbox,
         }
+        if block_type in [
+            BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote,
+            BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote
+        ]:
+            block_dict["group_id"] = block[-1]
         block_spans = []
         for span in spans:
             span_bbox = span['bbox']
...
@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
     return fix_blocks


+def fix_block_spans_v2(block_with_spans):
+    """1. Because img_blocks and table_blocks contain captions and footnotes, blocks are nested:
+    the text spans of captions and footnotes need to be placed into the caption_block and
+    footnote_block inside the corresponding img_block and table_block.
+    2. The spans field also needs to be removed from each block."""
+    fix_blocks = []
+    for block in block_with_spans:
+        block_type = block['type']
+
+        if block_type in [
+            BlockType.Text, BlockType.Title,
+            BlockType.ImageCaption, BlockType.ImageFootnote,
+            BlockType.TableCaption, BlockType.TableFootnote
+        ]:
+            block = fix_text_block(block)
+        elif block_type in [BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody]:
+            block = fix_interline_block(block)
+        else:
+            continue
+        fix_blocks.append(block)
+    return fix_blocks
+
+
 def fix_discarded_block(discarded_block_with_spans):
     fix_discarded_blocks = []
     for block in discarded_block_with_spans:
...
...
magic_pdf/resources/model_config/model_configs.yaml

-config:
-  device: cpu
-  layout: True
-  formula: True
-  table_config:
-    model: TableMaster
-    is_table_recog_enable: False
-    max_time: 400
-
 weights:
-  layout: Layout/model_final.pth
-  mfd: MFD/weights.pt
-  mfr: MFR/unimernet_small
+  layoutlmv3: Layout/LayoutLMv3/model_final.pth
+  doclayout_yolo: Layout/YOLO/doclayout_yolo_ft.pt
+  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
+  unimernet_small: MFR/unimernet_small
   struct_eqtable: TabRec/StructEqTable
-  TableMaster: TabRec/TableMaster
\ No newline at end of file
+  tablemaster: TabRec/TableMaster
\ No newline at end of file
magic_pdf/tools/cli.py
...
@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""",
     help="""
     Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
     You should input "Abbreviation" with language form url:
-    https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
+    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
     """,
     default=None,
 )
...
...
magic_pdf/tools/common.py
...
@@ -6,8 +6,8 @@ import click
 from loguru import logger

 import magic_pdf.model as model_config
-from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
-                                      draw_model_bbox, draw_line_sort_bbox)
+from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
+                                      draw_model_bbox, draw_span_bbox)
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
...
@@ -46,10 +46,12 @@ def do_parse(
     start_page_id=0,
     end_page_id=None,
     lang=None,
+    layout_model=None,
+    formula_enable=None,
+    table_enable=None,
 ):
     if debug_able:
         logger.warning('debug mode is on')
-        # f_dump_content_list = True
         f_draw_model_bbox = True
         f_draw_line_sort_bbox = True
...
@@ -64,13 +66,16 @@ def do_parse(
     if parse_method == 'auto':
         jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
         pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
+                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
     elif parse_method == 'txt':
         pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
+                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
    elif parse_method == 'ocr':
         pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
-                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
+                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
+                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
     else:
         logger.error('unknown parse method')
         exit(1)
...
...
magic_pdf/user_api.py
...
@@ -101,11 +101,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
     if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
         logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
         if input_model_is_empty:
-            pdf_models = doc_analyze(pdf_bytes, ocr=True,
-                                     start_page_id=start_page_id, end_page_id=end_page_id, lang=lang)
+            layout_model = kwargs.get("layout_model", None)
+            formula_enable = kwargs.get("formula_enable", None)
+            table_enable = kwargs.get("table_enable", None)
+            pdf_models = doc_analyze(
+                pdf_bytes,
+                ocr=True,
+                start_page_id=start_page_id,
+                end_page_id=end_page_id,
+                lang=lang,
+                layout_model=layout_model,
+                formula_enable=formula_enable,
+                table_enable=table_enable,
+            )
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
         if pdf_info_dict is None:
             raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
...
...
magic_pdf/utils/__init__.py (new file, mode 100644; empty)
magic_pdf/utils/annotations.py (new file, mode 100644)

from loguru import logger


def ImportPIL(f):
    try:
        import PIL  # noqa: F401
    except ImportError:
        logger.error('Pillow not installed, please install by pip.')
        exit(1)
    return f
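ImportPIL is a small guard decorator: applying it runs the Pillow import check once, at decoration time, and aborts with an error if Pillow is missing, then returns the function unchanged. A hedged usage sketch (the decorated function below is invented for illustration and is not part of this commit):

# Illustrative use of the ImportPIL guard; render_page_preview is a made-up function.
from magic_pdf.utils.annotations import ImportPIL

@ImportPIL
def render_page_preview(image_path):
    from PIL import Image          # safe: the decorator already verified Pillow is installed
    with Image.open(image_path) as im:
        return im.size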
docs/en/.readthedocs.yaml → next_docs/en/.readthedocs.yaml (file moved)
docs/en/Makefile → next_docs/en/Makefile (file moved)
docs/en/_static/image/logo.png → next_docs/en/_static/image/logo.png (file moved)
next_docs/en/api.rst (new file, mode 100644)

Data Api
------------------

.. toctree::
   :maxdepth: 2

   api/dataset.rst
   api/data_reader_writer.rst
   api/read_api.rst
next_docs/en/api/data_reader_writer.rst (new file, mode 100644)

Data Reader Writer
--------------------

.. autoclass:: magic_pdf.data.data_reader_writer.DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
   :members:
   :inherited-members:

.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
   :members:
   :inherited-members: