Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
6575adea
Unverified
Commit
6575adea
authored
Oct 29, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Oct 29, 2024
Browse files
Merge pull request #808 from opendatalab/dev
Dev->0.9 release
parents
82dd7ac5
37dd55c4
Changes
67
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
480 additions
and
118 deletions
+480
-118
docs/images/flowchart_zh_cn.png
docs/images/flowchart_zh_cn.png
+0
-0
docs/images/layout_example.png
docs/images/layout_example.png
+0
-0
docs/images/poly.png
docs/images/poly.png
+0
-0
docs/images/project_panorama_en.png
docs/images/project_panorama_en.png
+0
-0
docs/images/project_panorama_zh_cn.png
docs/images/project_panorama_zh_cn.png
+0
-0
docs/images/spans_example.png
docs/images/spans_example.png
+0
-0
docs/images/web_demo_1.png
docs/images/web_demo_1.png
+0
-0
docs/output_file_en_us.md
docs/output_file_en_us.md
+0
-0
docs/output_file_zh_cn.md
docs/output_file_zh_cn.md
+0
-0
magic_pdf/config/__init__.py
magic_pdf/config/__init__.py
+0
-0
magic_pdf/dict2md/ocr_mkcontent.py
magic_pdf/dict2md/ocr_mkcontent.py
+11
-11
magic_pdf/libs/Constants.py
magic_pdf/libs/Constants.py
+8
-2
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+39
-13
magic_pdf/model/magic_model.py
magic_pdf/model/magic_model.py
+228
-27
magic_pdf/model/pdf_extract_kit.py
magic_pdf/model/pdf_extract_kit.py
+25
-6
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py
...f/model/pek_sub_modules/structeqtable/StructTableModel.py
+8
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+102
-32
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+31
-24
magic_pdf/pre_proc/ocr_dict_merge.py
magic_pdf/pre_proc/ocr_dict_merge.py
+27
-1
magic_pdf/tools/cli.py
magic_pdf/tools/cli.py
+1
-1
No files found.
old_
docs/images/flowchart_zh_cn.png
→
docs/images/flowchart_zh_cn.png
View file @
6575adea
File moved
old_
docs/images/layout_example.png
→
docs/images/layout_example.png
View file @
6575adea
File moved
old_
docs/images/poly.png
→
docs/images/poly.png
View file @
6575adea
File moved
old_
docs/images/project_panorama_en.png
→
docs/images/project_panorama_en.png
View file @
6575adea
File moved
old_
docs/images/project_panorama_zh_cn.png
→
docs/images/project_panorama_zh_cn.png
View file @
6575adea
File moved
old_
docs/images/spans_example.png
→
docs/images/spans_example.png
View file @
6575adea
File moved
old_
docs/images/web_demo_1.png
→
docs/images/web_demo_1.png
View file @
6575adea
File moved
old_
docs/output_file_en_us.md
→
docs/output_file_en_us.md
View file @
6575adea
File moved
old_
docs/output_file_zh_cn.md
→
docs/output_file_zh_cn.md
View file @
6575adea
File moved
docs/en/api/io.rst
→
magic_pdf/config/__init__.py
View file @
6575adea
File moved
magic_pdf/dict2md/ocr_mkcontent.py
View file @
6575adea
...
@@ -70,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -70,17 +70,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n

}
)
\n
"
para_text
+=
f
"
\n

}
)
\n
"
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼image_caption
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
for
block
in
para_block
[
'blocks'
]:
#
2n
d.拼image_
caption
for
block
in
para_block
[
'blocks'
]:
#
3r
d.拼image_
footnote
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
if
mode
==
'nlp'
:
if
mode
==
'nlp'
:
continue
continue
elif
mode
==
'mm'
:
elif
mode
==
'mm'
:
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
for
block
in
para_block
[
'blocks'
]:
# 1st.拼table_caption
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
for
block
in
para_block
[
'blocks'
]:
# 2nd.拼table_body
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
...
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
...
@@ -95,7 +95,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
para_text
+=
f
"
\n

}
)
\n
"
para_text
+=
f
"
\n

}
)
\n
"
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
for
block
in
para_block
[
'blocks'
]:
# 3rd.拼table_footnote
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_text
+=
merge_para_with_text
(
block
)
para_text
+=
merge_para_with_text
(
block
)
+
'
\n
'
if
para_text
.
strip
()
==
''
:
if
para_text
.
strip
()
==
''
:
continue
continue
...
@@ -180,18 +180,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -180,18 +180,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
'text_format'
:
'latex'
,
'text_format'
:
'latex'
,
}
}
elif
para_type
==
BlockType
.
Image
:
elif
para_type
==
BlockType
.
Image
:
para_content
=
{
'type'
:
'image'
}
para_content
=
{
'type'
:
'image'
,
'img_caption'
:
[],
'img_footnote'
:
[]
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
if
block
[
'type'
]
==
BlockType
.
ImageBody
:
para_content
[
'img_path'
]
=
join_path
(
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
img_buket_path
,
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
block
[
'lines'
][
0
][
'spans'
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
if
block
[
'type'
]
==
BlockType
.
ImageCaption
:
para_content
[
'img_caption'
]
=
merge_para_with_text
(
block
)
para_content
[
'img_caption'
]
.
append
(
merge_para_with_text
(
block
)
)
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
if
block
[
'type'
]
==
BlockType
.
ImageFootnote
:
para_content
[
'img_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'img_footnote'
]
.
append
(
merge_para_with_text
(
block
)
)
elif
para_type
==
BlockType
.
Table
:
elif
para_type
==
BlockType
.
Table
:
para_content
=
{
'type'
:
'table'
}
para_content
=
{
'type'
:
'table'
,
'table_caption'
:
[],
'table_footnote'
:
[]
}
for
block
in
para_block
[
'blocks'
]:
for
block
in
para_block
[
'blocks'
]:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
'type'
]
==
BlockType
.
TableBody
:
if
block
[
"lines"
][
0
][
"spans"
][
0
].
get
(
'latex'
,
''
):
if
block
[
"lines"
][
0
][
"spans"
][
0
].
get
(
'latex'
,
''
):
...
@@ -200,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
...
@@ -200,9 +200,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
para_content
[
'table_body'
]
=
f
"
\n\n
{
block
[
'lines'
][
0
][
'spans'
][
0
][
'html'
]
}
\n\n
"
para_content
[
'table_body'
]
=
f
"
\n\n
{
block
[
'lines'
][
0
][
'spans'
][
0
][
'html'
]
}
\n\n
"
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
para_content
[
'img_path'
]
=
join_path
(
img_buket_path
,
block
[
"lines"
][
0
][
"spans"
][
0
][
'image_path'
])
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
if
block
[
'type'
]
==
BlockType
.
TableCaption
:
para_content
[
'table_caption'
]
=
merge_para_with_text
(
block
)
para_content
[
'table_caption'
]
.
append
(
merge_para_with_text
(
block
)
)
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
if
block
[
'type'
]
==
BlockType
.
TableFootnote
:
para_content
[
'table_footnote'
]
=
merge_para_with_text
(
block
)
para_content
[
'table_footnote'
]
.
append
(
merge_para_with_text
(
block
)
)
para_content
[
'page_idx'
]
=
page_idx
para_content
[
'page_idx'
]
=
page_idx
...
...
magic_pdf/libs/Constants.py
View file @
6575adea
...
@@ -23,14 +23,20 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
...
@@ -23,14 +23,20 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
TABLE_MASTER_DIR
=
"table_structure_tablemaster_infer/"
TABLE_MASTER_DIR
=
"table_structure_tablemaster_infer/"
# pp detect model dir
# pp detect model dir
DETECT_MODEL_DIR
=
"ch_PP-OCRv
3
_det_infer"
DETECT_MODEL_DIR
=
"ch_PP-OCRv
4
_det_infer"
# pp rec model dir
# pp rec model dir
REC_MODEL_DIR
=
"ch_PP-OCRv
3
_rec_infer"
REC_MODEL_DIR
=
"ch_PP-OCRv
4
_rec_infer"
# pp rec char dict path
# pp rec char dict path
REC_CHAR_DICT
=
"ppocr_keys_v1.txt"
REC_CHAR_DICT
=
"ppocr_keys_v1.txt"
# pp rec copy rec directory
PP_REC_DIRECTORY
=
".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
# pp rec copy det directory
PP_DET_DIRECTORY
=
".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
class
MODEL_NAME
:
class
MODEL_NAME
:
# pp table structure algorithm
# pp table structure algorithm
...
...
magic_pdf/libs/draw_bbox.py
View file @
6575adea
...
@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -141,11 +141,33 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list
=
[]
layout_bbox_list
=
[]
table_type_order
=
{
'table_caption'
:
1
,
'table_body'
:
2
,
'table_footnote'
:
3
}
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_block_list
=
[]
page_block_list
=
[]
for
block
in
page
[
'para_blocks'
]:
for
block
in
page
[
'para_blocks'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
,
BlockType
.
List
,
BlockType
.
Index
,
]:
bbox
=
block
[
'bbox'
]
bbox
=
block
[
'bbox'
]
page_block_list
.
append
(
bbox
)
page_block_list
.
append
(
bbox
)
elif
block
[
'type'
]
in
[
BlockType
.
Image
]:
for
sub_block
in
block
[
'blocks'
]:
bbox
=
sub_block
[
'bbox'
]
page_block_list
.
append
(
bbox
)
elif
block
[
'type'
]
in
[
BlockType
.
Table
]:
sorted_blocks
=
sorted
(
block
[
'blocks'
],
key
=
lambda
x
:
table_type_order
[
x
[
'type'
]])
for
sub_block
in
sorted_blocks
:
bbox
=
sub_block
[
'bbox'
]
page_block_list
.
append
(
bbox
)
layout_bbox_list
.
append
(
page_block_list
)
layout_bbox_list
.
append
(
page_block_list
)
pdf_docs
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
...
@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -153,11 +175,11 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
for
i
,
page
in
enumerate
(
pdf_docs
):
for
i
,
page
in
enumerate
(
pdf_docs
):
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
158
,
158
,
158
],
True
)
draw_bbox_without_number
(
i
,
dropped_bbox_list
,
page
,
[
158
,
158
,
158
],
True
)
draw_bbox_without_number
(
i
,
tables_list
,
page
,
[
153
,
153
,
0
],
True
)
# color !
#
draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True) # color !
draw_bbox_without_number
(
i
,
tables_body_list
,
page
,
[
204
,
204
,
0
],
True
)
draw_bbox_without_number
(
i
,
tables_body_list
,
page
,
[
204
,
204
,
0
],
True
)
draw_bbox_without_number
(
i
,
tables_caption_list
,
page
,
[
255
,
255
,
102
],
True
)
draw_bbox_without_number
(
i
,
tables_caption_list
,
page
,
[
255
,
255
,
102
],
True
)
draw_bbox_without_number
(
i
,
tables_footnote_list
,
page
,
[
229
,
255
,
204
],
True
)
draw_bbox_without_number
(
i
,
tables_footnote_list
,
page
,
[
229
,
255
,
204
],
True
)
draw_bbox_without_number
(
i
,
imgs_list
,
page
,
[
51
,
102
,
0
],
True
)
#
draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
draw_bbox_without_number
(
i
,
imgs_body_list
,
page
,
[
153
,
255
,
51
],
True
)
draw_bbox_without_number
(
i
,
imgs_body_list
,
page
,
[
153
,
255
,
51
],
True
)
draw_bbox_without_number
(
i
,
imgs_caption_list
,
page
,
[
102
,
178
,
255
],
True
)
draw_bbox_without_number
(
i
,
imgs_caption_list
,
page
,
[
102
,
178
,
255
],
True
)
draw_bbox_without_number
(
i
,
imgs_footnote_list
,
page
,
[
255
,
178
,
102
],
True
),
draw_bbox_without_number
(
i
,
imgs_footnote_list
,
page
,
[
255
,
178
,
102
],
True
),
...
@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
...
@@ -338,19 +360,23 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
for
page
in
pdf_info
:
for
page
in
pdf_info
:
page_line_list
=
[]
page_line_list
=
[]
for
block
in
page
[
'preproc_blocks'
]:
for
block
in
page
[
'preproc_blocks'
]:
if
block
[
'type'
]
in
[
't
ext
'
,
't
itle
'
,
'i
nterline
_e
quation
'
]:
if
block
[
'type'
]
in
[
BlockType
.
T
ext
,
BlockType
.
T
itle
,
BlockType
.
I
nterline
E
quation
]:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
bbox
=
line
[
'bbox'
]
bbox
=
line
[
'bbox'
]
index
=
line
[
'index'
]
index
=
line
[
'index'
]
page_line_list
.
append
({
'index'
:
index
,
'bbox'
:
bbox
})
page_line_list
.
append
({
'index'
:
index
,
'bbox'
:
bbox
})
if
block
[
'type'
]
in
[
'table'
,
'image'
]:
if
block
[
'type'
]
in
[
BlockType
.
Image
,
BlockType
.
Table
]:
bbox
=
block
[
'bbox'
]
for
sub_block
in
block
[
'blocks'
]:
index
=
block
[
'index'
]
if
sub_block
[
'type'
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]:
for
line
in
sub_block
[
'virtual_lines'
]:
bbox
=
line
[
'bbox'
]
index
=
line
[
'index'
]
page_line_list
.
append
({
'index'
:
index
,
'bbox'
:
bbox
})
elif
sub_block
[
'type'
]
in
[
BlockType
.
ImageCaption
,
BlockType
.
TableCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableFootnote
]:
for
line
in
sub_block
[
'lines'
]:
bbox
=
line
[
'bbox'
]
index
=
line
[
'index'
]
page_line_list
.
append
({
'index'
:
index
,
'bbox'
:
bbox
})
page_line_list
.
append
({
'index'
:
index
,
'bbox'
:
bbox
})
# for line in block['lines']:
# bbox = line['bbox']
# index = line['index']
# page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes
=
sorted
(
page_line_list
,
key
=
lambda
x
:
x
[
'index'
])
sorted_bboxes
=
sorted
(
page_line_list
,
key
=
lambda
x
:
x
[
'index'
])
layout_bbox_list
.
append
(
sorted_bbox
[
'bbox'
]
for
sorted_bbox
in
sorted_bboxes
)
layout_bbox_list
.
append
(
sorted_bbox
[
'bbox'
]
for
sorted_bbox
in
sorted_bboxes
)
pdf_docs
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
pdf_docs
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
...
...
magic_pdf/model/magic_model.py
View file @
6575adea
import
enum
import
json
import
json
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.data.dataset
import
Dataset
...
@@ -10,6 +11,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
...
@@ -10,6 +11,7 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
from
magic_pdf.libs.local_math
import
float_gt
from
magic_pdf.libs.local_math
import
float_gt
from
magic_pdf.libs.ModelBlockTypeEnum
import
ModelBlockTypeEnum
from
magic_pdf.libs.ModelBlockTypeEnum
import
ModelBlockTypeEnum
from
magic_pdf.libs.ocr_content_type
import
CategoryId
,
ContentType
from
magic_pdf.libs.ocr_content_type
import
CategoryId
,
ContentType
from
magic_pdf.pre_proc.remove_bbox_overlap
import
_remove_overlap_between_bbox
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.AbsReaderWriter
import
AbsReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
from
magic_pdf.rw.DiskReaderWriter
import
DiskReaderWriter
...
@@ -17,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
...
@@ -17,6 +19,14 @@ CAPATION_OVERLAP_AREA_RATIO = 0.6
MERGE_BOX_OVERLAP_AREA_RATIO
=
1.1
MERGE_BOX_OVERLAP_AREA_RATIO
=
1.1
class
PosRelationEnum
(
enum
.
Enum
):
LEFT
=
'left'
RIGHT
=
'right'
UP
=
'up'
BOTTOM
=
'bottom'
ALL
=
'all'
class
MagicModel
:
class
MagicModel
:
"""每个函数没有得到元素的时候返回空list."""
"""每个函数没有得到元素的时候返回空list."""
...
@@ -124,8 +134,7 @@ class MagicModel:
...
@@ -124,8 +134,7 @@ class MagicModel:
l1
=
bbox1
[
2
]
-
bbox1
[
0
]
l1
=
bbox1
[
2
]
-
bbox1
[
0
]
l2
=
bbox2
[
2
]
-
bbox2
[
0
]
l2
=
bbox2
[
2
]
-
bbox2
[
0
]
min_l
,
max_l
=
min
(
l1
,
l2
),
max
(
l1
,
l2
)
if
l2
>
l1
and
(
l2
-
l1
)
/
l1
>
0.3
:
if
(
max_l
-
min_l
)
*
1.0
/
max_l
>
0.4
:
return
float
(
'inf'
)
return
float
(
'inf'
)
return
bbox_distance
(
bbox1
,
bbox2
)
return
bbox_distance
(
bbox1
,
bbox2
)
...
@@ -591,9 +600,24 @@ class MagicModel:
...
@@ -591,9 +600,24 @@ class MagicModel:
return
ret
,
total_subject_object_dis
return
ret
,
total_subject_object_dis
def
__tie_up_category_by_distance_v2
(
def
__tie_up_category_by_distance_v2
(
self
,
page_no
,
subject_category_id
,
object_category_id
self
,
page_no
:
int
,
subject_category_id
:
int
,
object_category_id
:
int
,
priority_pos
:
PosRelationEnum
,
):
):
"""_summary_
Args:
page_no (int): _description_
subject_category_id (int): _description_
object_category_id (int): _description_
priority_pos (PosRelationEnum): _description_
Returns:
_type_: _description_
"""
AXIS_MULPLICITY
=
0.5
subjects
=
self
.
__reduct_overlap
(
subjects
=
self
.
__reduct_overlap
(
list
(
list
(
map
(
map
(
...
@@ -617,67 +641,244 @@ class MagicModel:
...
@@ -617,67 +641,244 @@ class MagicModel:
)
)
)
)
)
)
M
=
len
(
objects
)
subjects
.
sort
(
key
=
lambda
x
:
x
[
'bbox'
][
0
]
**
2
+
x
[
'bbox'
][
1
]
**
2
)
subjects
.
sort
(
key
=
lambda
x
:
x
[
'bbox'
][
0
]
**
2
+
x
[
'bbox'
][
1
]
**
2
)
objects
.
sort
(
key
=
lambda
x
:
x
[
'bbox'
][
0
]
**
2
+
x
[
'bbox'
][
1
]
**
2
)
objects
.
sort
(
key
=
lambda
x
:
x
[
'bbox'
][
0
]
**
2
+
x
[
'bbox'
][
1
]
**
2
)
dis
=
[[
float
(
'inf'
)]
*
len
(
subjects
)
for
_
in
range
(
len
(
objects
))]
sub_obj_map_h
=
{
i
:
[]
for
i
in
range
(
len
(
subjects
))}
dis_by_directions
=
{
'top'
:
[[
-
1
,
float
(
'inf'
)]]
*
M
,
'bottom'
:
[[
-
1
,
float
(
'inf'
)]]
*
M
,
'left'
:
[[
-
1
,
float
(
'inf'
)]]
*
M
,
'right'
:
[[
-
1
,
float
(
'inf'
)]]
*
M
,
}
for
i
,
obj
in
enumerate
(
objects
):
for
i
,
obj
in
enumerate
(
objects
):
l_x_axis
,
l_y_axis
=
(
obj
[
'bbox'
][
2
]
-
obj
[
'bbox'
][
0
],
obj
[
'bbox'
][
3
]
-
obj
[
'bbox'
][
1
],
)
axis_unit
=
min
(
l_x_axis
,
l_y_axis
)
for
j
,
sub
in
enumerate
(
subjects
):
for
j
,
sub
in
enumerate
(
subjects
):
dis
[
i
][
j
]
=
self
.
_bbox_distance
(
sub
[
'bbox'
],
obj
[
'bbox'
])
sub_obj_map_h
=
{
i
:
[]
for
i
in
range
(
len
(
subjects
))}
bbox1
,
bbox2
,
_
=
_remove_overlap_between_bbox
(
for
i
in
range
(
len
(
objects
)):
objects
[
i
][
'bbox'
],
subjects
[
j
][
'bbox'
]
min_l_idx
=
0
)
for
j
in
range
(
1
,
len
(
subjects
)):
left
,
right
,
bottom
,
top
=
bbox_relative_pos
(
bbox1
,
bbox2
)
if
dis
[
i
][
j
]
==
float
(
'inf'
):
flags
=
[
left
,
right
,
bottom
,
top
]
if
sum
([
1
if
v
else
0
for
v
in
flags
])
>
1
:
continue
continue
if
dis
[
i
][
j
]
<
dis
[
i
][
min_l_idx
]:
min_l_idx
=
j
if
dis
[
i
][
min_l_idx
]
<
float
(
'inf'
):
if
left
:
sub_obj_map_h
[
min_l_idx
].
append
(
i
)
if
dis_by_directions
[
'left'
][
i
][
1
]
>
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]
):
dis_by_directions
[
'left'
][
i
]
=
[
j
,
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
]
if
right
:
if
dis_by_directions
[
'right'
][
i
][
1
]
>
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]
):
dis_by_directions
[
'right'
][
i
]
=
[
j
,
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
]
if
bottom
:
if
dis_by_directions
[
'bottom'
][
i
][
1
]
>
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]
):
dis_by_directions
[
'bottom'
][
i
]
=
[
j
,
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
]
if
top
:
if
dis_by_directions
[
'top'
][
i
][
1
]
>
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]
):
dis_by_directions
[
'top'
][
i
]
=
[
j
,
bbox_distance
(
obj
[
'bbox'
],
sub
[
'bbox'
]),
]
if
(
dis_by_directions
[
'top'
][
i
][
1
]
!=
float
(
'inf'
)
and
dis_by_directions
[
'bottom'
][
i
][
1
]
!=
float
(
'inf'
)
and
priority_pos
in
(
PosRelationEnum
.
BOTTOM
,
PosRelationEnum
.
UP
)
):
RATIO
=
3
if
(
abs
(
dis_by_directions
[
'top'
][
i
][
1
]
-
dis_by_directions
[
'bottom'
][
i
][
1
]
)
<
RATIO
*
axis_unit
):
if
priority_pos
==
PosRelationEnum
.
BOTTOM
:
sub_obj_map_h
[
dis_by_directions
[
'bottom'
][
i
][
0
]].
append
(
i
)
else
:
else
:
print
(
i
,
'no nearest'
)
sub_obj_map_h
[
dis_by_directions
[
'top'
][
i
][
0
]].
append
(
i
)
continue
if
dis_by_directions
[
'left'
][
i
][
1
]
!=
float
(
'inf'
)
or
dis_by_directions
[
'right'
][
i
][
1
]
!=
float
(
'inf'
):
if
dis_by_directions
[
'left'
][
i
][
1
]
!=
float
(
'inf'
)
and
dis_by_directions
[
'right'
][
i
][
1
]
!=
float
(
'inf'
):
if
AXIS_MULPLICITY
*
axis_unit
>=
abs
(
dis_by_directions
[
'left'
][
i
][
1
]
-
dis_by_directions
[
'right'
][
i
][
1
]
):
left_sub_bbox
=
subjects
[
dis_by_directions
[
'left'
][
i
][
0
]][
'bbox'
]
right_sub_bbox
=
subjects
[
dis_by_directions
[
'right'
][
i
][
0
]][
'bbox'
]
left_sub_bbox_y_axis
=
left_sub_bbox
[
3
]
-
left_sub_bbox
[
1
]
right_sub_bbox_y_axis
=
right_sub_bbox
[
3
]
-
right_sub_bbox
[
1
]
if
(
abs
(
left_sub_bbox_y_axis
-
l_y_axis
)
+
dis_by_directions
[
'left'
][
i
][
0
]
>
abs
(
right_sub_bbox_y_axis
-
l_y_axis
)
+
dis_by_directions
[
'right'
][
i
][
0
]
):
left_or_right
=
dis_by_directions
[
'right'
][
i
]
else
:
left_or_right
=
dis_by_directions
[
'left'
][
i
]
else
:
left_or_right
=
dis_by_directions
[
'left'
][
i
]
if
left_or_right
[
1
]
>
dis_by_directions
[
'right'
][
i
][
1
]:
left_or_right
=
dis_by_directions
[
'right'
][
i
]
else
:
left_or_right
=
dis_by_directions
[
'left'
][
i
]
if
left_or_right
[
1
]
==
float
(
'inf'
):
left_or_right
=
dis_by_directions
[
'right'
][
i
]
else
:
left_or_right
=
[
-
1
,
float
(
'inf'
)]
if
dis_by_directions
[
'top'
][
i
][
1
]
!=
float
(
'inf'
)
or
dis_by_directions
[
'bottom'
][
i
][
1
]
!=
float
(
'inf'
):
if
dis_by_directions
[
'top'
][
i
][
1
]
!=
float
(
'inf'
)
and
dis_by_directions
[
'bottom'
][
i
][
1
]
!=
float
(
'inf'
):
if
AXIS_MULPLICITY
*
axis_unit
>=
abs
(
dis_by_directions
[
'top'
][
i
][
1
]
-
dis_by_directions
[
'bottom'
][
i
][
1
]
):
top_bottom
=
subjects
[
dis_by_directions
[
'bottom'
][
i
][
0
]][
'bbox'
]
bottom_top
=
subjects
[
dis_by_directions
[
'top'
][
i
][
0
]][
'bbox'
]
top_bottom_x_axis
=
top_bottom
[
2
]
-
top_bottom
[
0
]
bottom_top_x_axis
=
bottom_top
[
2
]
-
bottom_top
[
0
]
if
(
abs
(
top_bottom_x_axis
-
l_x_axis
)
+
dis_by_directions
[
'bottom'
][
i
][
1
]
>
abs
(
bottom_top_x_axis
-
l_x_axis
)
+
dis_by_directions
[
'top'
][
i
][
1
]
):
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
else
:
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
else
:
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
if
top_or_bottom
[
1
]
>
dis_by_directions
[
'bottom'
][
i
][
1
]:
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
else
:
top_or_bottom
=
dis_by_directions
[
'top'
][
i
]
if
top_or_bottom
[
1
]
==
float
(
'inf'
):
top_or_bottom
=
dis_by_directions
[
'bottom'
][
i
]
else
:
top_or_bottom
=
[
-
1
,
float
(
'inf'
)]
if
left_or_right
[
1
]
!=
float
(
'inf'
)
or
top_or_bottom
[
1
]
!=
float
(
'inf'
):
if
left_or_right
[
1
]
!=
float
(
'inf'
)
and
top_or_bottom
[
1
]
!=
float
(
'inf'
):
if
AXIS_MULPLICITY
*
axis_unit
>=
abs
(
left_or_right
[
1
]
-
top_or_bottom
[
1
]
):
y_axis_bbox
=
subjects
[
left_or_right
[
0
]][
'bbox'
]
x_axis_bbox
=
subjects
[
top_or_bottom
[
0
]][
'bbox'
]
if
(
abs
((
x_axis_bbox
[
2
]
-
x_axis_bbox
[
0
])
-
l_x_axis
)
/
l_x_axis
>
abs
((
y_axis_bbox
[
3
]
-
y_axis_bbox
[
1
])
-
l_y_axis
)
/
l_y_axis
):
sub_obj_map_h
[
left_or_right
[
0
]].
append
(
i
)
else
:
sub_obj_map_h
[
top_or_bottom
[
0
]].
append
(
i
)
else
:
if
left_or_right
[
1
]
>
top_or_bottom
[
1
]:
sub_obj_map_h
[
top_or_bottom
[
0
]].
append
(
i
)
else
:
sub_obj_map_h
[
left_or_right
[
0
]].
append
(
i
)
else
:
if
left_or_right
[
1
]
!=
float
(
'inf'
):
sub_obj_map_h
[
left_or_right
[
0
]].
append
(
i
)
else
:
sub_obj_map_h
[
top_or_bottom
[
0
]].
append
(
i
)
ret
=
[]
ret
=
[]
for
i
in
sub_obj_map_h
.
keys
():
for
i
in
sub_obj_map_h
.
keys
():
ret
.
append
(
ret
.
append
(
{
{
'sub_bbox'
:
subjects
[
i
][
'bbox'
],
'sub_bbox'
:
{
'obj_bboxes'
:
[
objects
[
j
][
'bbox'
]
for
j
in
sub_obj_map_h
[
i
]],
'bbox'
:
subjects
[
i
][
'bbox'
],
'score'
:
subjects
[
i
][
'score'
],
},
'obj_bboxes'
:
[
{
'score'
:
objects
[
j
][
'score'
],
'bbox'
:
objects
[
j
][
'bbox'
]}
for
j
in
sub_obj_map_h
[
i
]
],
'sub_idx'
:
i
,
'sub_idx'
:
i
,
}
}
)
)
return
ret
return
ret
def
get_imgs_v2
(
self
,
page_no
:
int
):
def
get_imgs_v2
(
self
,
page_no
:
int
):
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
4
)
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
4
,
PosRelationEnum
.
BOTTOM
)
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
3
,
CategoryId
.
ImageFootnote
page_no
,
3
,
CategoryId
.
ImageFootnote
,
PosRelationEnum
.
ALL
)
)
ret
=
[]
ret
=
[]
for
v
in
with_captions
:
for
v
in
with_captions
:
record
=
{
record
=
{
'image_b
box
'
:
v
[
'sub_bbox'
],
'image_b
ody
'
:
v
[
'sub_bbox'
],
'image_caption_
bbox_
list'
:
v
[
'obj_bboxes'
],
'image_caption_list'
:
v
[
'obj_bboxes'
],
}
}
filter_idx
=
v
[
'sub_idx'
]
filter_idx
=
v
[
'sub_idx'
]
d
=
next
(
filter
(
lambda
x
:
x
[
'sub_idx'
]
==
filter_idx
,
with_footnotes
))
d
=
next
(
filter
(
lambda
x
:
x
[
'sub_idx'
]
==
filter_idx
,
with_footnotes
))
record
[
'image_footnote_
bbox_
list'
]
=
d
[
'obj_bboxes'
]
record
[
'image_footnote_list'
]
=
d
[
'obj_bboxes'
]
ret
.
append
(
record
)
ret
.
append
(
record
)
return
ret
return
ret
def
get_tables_v2
(
self
,
page_no
:
int
)
->
list
:
def
get_tables_v2
(
self
,
page_no
:
int
)
->
list
:
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
6
)
with_captions
=
self
.
__tie_up_category_by_distance_v2
(
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
7
)
page_no
,
5
,
6
,
PosRelationEnum
.
UP
)
with_footnotes
=
self
.
__tie_up_category_by_distance_v2
(
page_no
,
5
,
7
,
PosRelationEnum
.
ALL
)
ret
=
[]
ret
=
[]
for
v
in
with_captions
:
for
v
in
with_captions
:
record
=
{
record
=
{
'table_b
box
'
:
v
[
'sub_bbox'
],
'table_b
ody
'
:
v
[
'sub_bbox'
],
'table_caption_
bbox_
list'
:
v
[
'obj_bboxes'
],
'table_caption_list'
:
v
[
'obj_bboxes'
],
}
}
filter_idx
=
v
[
'sub_idx'
]
filter_idx
=
v
[
'sub_idx'
]
d
=
next
(
filter
(
lambda
x
:
x
[
'sub_idx'
]
==
filter_idx
,
with_footnotes
))
d
=
next
(
filter
(
lambda
x
:
x
[
'sub_idx'
]
==
filter_idx
,
with_footnotes
))
record
[
'table_footnote_
bbox_
list'
]
=
d
[
'obj_bboxes'
]
record
[
'table_footnote_list'
]
=
d
[
'obj_bboxes'
]
ret
.
append
(
record
)
ret
.
append
(
record
)
return
ret
return
ret
...
...
magic_pdf/model/pdf_extract_kit.py
View file @
6575adea
from
loguru
import
logger
from
loguru
import
logger
import
os
import
os
import
time
import
time
from
pathlib
import
Path
import
shutil
from
magic_pdf.libs.Constants
import
*
from
magic_pdf.libs.Constants
import
*
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.libs.clean_memory
import
clean_memory
from
magic_pdf.model.model_list
import
AtomicModel
from
magic_pdf.model.model_list
import
AtomicModel
...
@@ -37,19 +38,24 @@ except ImportError as e:
...
@@ -37,19 +38,24 @@ except ImportError as e:
from
magic_pdf.model.pek_sub_modules.layoutlmv3.model_init
import
Layoutlmv3_Predictor
from
magic_pdf.model.pek_sub_modules.layoutlmv3.model_init
import
Layoutlmv3_Predictor
from
magic_pdf.model.pek_sub_modules.post_process
import
latex_rm_whitespace
from
magic_pdf.model.pek_sub_modules.post_process
import
latex_rm_whitespace
from
magic_pdf.model.pek_sub_modules.self_modify
import
ModifiedPaddleOCR
from
magic_pdf.model.pek_sub_modules.self_modify
import
ModifiedPaddleOCR
from
magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel
import
StructTableModel
#
from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
from
magic_pdf.model.ppTableModel
import
ppTableModel
from
magic_pdf.model.ppTableModel
import
ppTableModel
def
table_model_init
(
table_model_type
,
model_path
,
max_time
,
_device_
=
'cpu'
):
def
table_model_init
(
table_model_type
,
model_path
,
max_time
,
_device_
=
'cpu'
):
if
table_model_type
==
MODEL_NAME
.
STRUCT_EQTABLE
:
if
table_model_type
==
MODEL_NAME
.
STRUCT_EQTABLE
:
table_model
=
StructTableModel
(
model_path
,
max_time
=
max_time
,
device
=
_device_
)
# table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
else
:
logger
.
error
(
"StructEqTable is under upgrade, the current version does not support it."
)
exit
(
1
)
elif
table_model_type
==
MODEL_NAME
.
TABLE_MASTER
:
config
=
{
config
=
{
"model_dir"
:
model_path
,
"model_dir"
:
model_path
,
"device"
:
_device_
"device"
:
_device_
}
}
table_model
=
ppTableModel
(
config
)
table_model
=
ppTableModel
(
config
)
else
:
logger
.
error
(
"table model type not allow"
)
exit
(
1
)
return
table_model
return
table_model
...
@@ -83,7 +89,7 @@ def doclayout_yolo_model_init(weight):
...
@@ -83,7 +89,7 @@ def doclayout_yolo_model_init(weight):
return
model
return
model
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
,
lang
=
None
,
use_dilation
=
True
,
det_db_unclip_ratio
=
2.4
):
def
ocr_model_init
(
show_log
:
bool
=
False
,
det_db_box_thresh
=
0.3
,
lang
=
None
,
use_dilation
=
True
,
det_db_unclip_ratio
=
1.8
):
if
lang
is
not
None
:
if
lang
is
not
None
:
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
,
lang
=
lang
,
use_dilation
=
use_dilation
,
det_db_unclip_ratio
=
det_db_unclip_ratio
)
model
=
ModifiedPaddleOCR
(
show_log
=
show_log
,
det_db_box_thresh
=
det_db_box_thresh
,
lang
=
lang
,
use_dilation
=
use_dilation
,
det_db_unclip_ratio
=
det_db_unclip_ratio
)
else
:
else
:
...
@@ -297,6 +303,17 @@ class CustomPEKModel:
...
@@ -297,6 +303,17 @@ class CustomPEKModel:
device
=
self
.
device
device
=
self
.
device
)
)
home_directory
=
Path
.
home
()
det_source
=
os
.
path
.
join
(
models_dir
,
table_model_dir
,
DETECT_MODEL_DIR
)
rec_source
=
os
.
path
.
join
(
models_dir
,
table_model_dir
,
REC_MODEL_DIR
)
det_dest_dir
=
os
.
path
.
join
(
home_directory
,
PP_DET_DIRECTORY
)
rec_dest_dir
=
os
.
path
.
join
(
home_directory
,
PP_REC_DIRECTORY
)
if
not
os
.
path
.
exists
(
det_dest_dir
):
shutil
.
copytree
(
det_source
,
det_dest_dir
)
if
not
os
.
path
.
exists
(
rec_dest_dir
):
shutil
.
copytree
(
rec_source
,
rec_dest_dir
)
logger
.
info
(
'DocAnalysis init done!'
)
logger
.
info
(
'DocAnalysis init done!'
)
def
__call__
(
self
,
image
):
def
__call__
(
self
,
image
):
...
@@ -314,7 +331,7 @@ class CustomPEKModel:
...
@@ -314,7 +331,7 @@ class CustomPEKModel:
elif
self
.
layout_model_name
==
MODEL_NAME
.
DocLayout_YOLO
:
elif
self
.
layout_model_name
==
MODEL_NAME
.
DocLayout_YOLO
:
# doclayout_yolo
# doclayout_yolo
layout_res
=
[]
layout_res
=
[]
doclayout_yolo_res
=
self
.
layout_model
.
predict
(
image
,
imgsz
=
1024
,
conf
=
0.
1
5
,
iou
=
0.45
,
verbose
=
True
,
device
=
self
.
device
)[
0
]
doclayout_yolo_res
=
self
.
layout_model
.
predict
(
image
,
imgsz
=
1024
,
conf
=
0.
2
5
,
iou
=
0.45
,
verbose
=
True
,
device
=
self
.
device
)[
0
]
for
xyxy
,
conf
,
cla
in
zip
(
doclayout_yolo_res
.
boxes
.
xyxy
.
cpu
(),
doclayout_yolo_res
.
boxes
.
conf
.
cpu
(),
doclayout_yolo_res
.
boxes
.
cls
.
cpu
()):
for
xyxy
,
conf
,
cla
in
zip
(
doclayout_yolo_res
.
boxes
.
xyxy
.
cpu
(),
doclayout_yolo_res
.
boxes
.
conf
.
cpu
(),
doclayout_yolo_res
.
boxes
.
cls
.
cpu
()):
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
xmin
,
ymin
,
xmax
,
ymax
=
[
int
(
p
.
item
())
for
p
in
xyxy
]
new_item
=
{
new_item
=
{
...
@@ -472,3 +489,5 @@ class CustomPEKModel:
...
@@ -472,3 +489,5 @@ class CustomPEKModel:
logger
.
info
(
f
"-----page total time:
{
round
(
time
.
time
()
-
page_start
,
2
)
}
-----"
)
logger
.
info
(
f
"-----page total time:
{
round
(
time
.
time
()
-
page_start
,
2
)
}
-----"
)
return
layout_res
return
layout_res
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py
View file @
6575adea
from
struct_eqtable.model
import
StructTable
from
loguru
import
logger
try
:
from
struct_eqtable.model
import
StructTable
except
ImportError
:
logger
.
error
(
"StructEqTable is under upgrade, the current version does not support it."
)
from
pypandoc
import
convert_text
from
pypandoc
import
convert_text
class
StructTableModel
:
class
StructTableModel
:
def
__init__
(
self
,
model_path
,
max_new_tokens
=
2048
,
max_time
=
400
,
device
=
'cpu'
):
def
__init__
(
self
,
model_path
,
max_new_tokens
=
2048
,
max_time
=
400
,
device
=
'cpu'
):
# init
# init
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
6575adea
import
copy
import
os
import
os
import
statistics
import
statistics
import
time
import
time
...
@@ -15,7 +16,7 @@ from magic_pdf.libs.convert_utils import dict_to_list
...
@@ -15,7 +16,7 @@ from magic_pdf.libs.convert_utils import dict_to_list
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.drop_reason
import
DropReason
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.hash_utils
import
compute_md5
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.local_math
import
float_equal
from
magic_pdf.libs.ocr_content_type
import
ContentType
from
magic_pdf.libs.ocr_content_type
import
ContentType
,
BlockType
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.para.para_split_v3
import
para_split
from
magic_pdf.para.para_split_v3
import
para_split
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
from
magic_pdf.pre_proc.citationmarker_remove
import
remove_citation_marker
...
@@ -29,7 +30,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
...
@@ -29,7 +30,7 @@ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2
ocr_prepare_bboxes_for_layout_split_v2
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
fill_spans_in_blocks
,
from
magic_pdf.pre_proc.ocr_dict_merge
import
(
fill_spans_in_blocks
,
fix_block_spans
,
fix_block_spans
,
fix_discarded_block
)
fix_discarded_block
,
fix_block_spans_v2
)
from
magic_pdf.pre_proc.ocr_span_list_modify
import
(
from
magic_pdf.pre_proc.ocr_span_list_modify
import
(
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
get_qa_need_list_v2
,
remove_overlaps_low_confidence_spans
,
remove_overlaps_min_spans
)
remove_overlaps_min_spans
)
...
@@ -173,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
...
@@ -173,19 +174,6 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
def
cal_block_index
(
fix_blocks
,
sorted_bboxes
):
def
cal_block_index
(
fix_blocks
,
sorted_bboxes
):
for
block
in
fix_blocks
:
for
block
in
fix_blocks
:
# if block['type'] in ['text', 'title', 'interline_equation']:
# line_index_list = []
# if len(block['lines']) == 0:
# block['index'] = sorted_bboxes.index(block['bbox'])
# else:
# for line in block['lines']:
# line['index'] = sorted_bboxes.index(line['bbox'])
# line_index_list.append(line['index'])
# median_value = statistics.median(line_index_list)
# block['index'] = median_value
#
# elif block['type'] in ['table', 'image']:
# block['index'] = sorted_bboxes.index(block['bbox'])
line_index_list
=
[]
line_index_list
=
[]
if
len
(
block
[
'lines'
])
==
0
:
if
len
(
block
[
'lines'
])
==
0
:
...
@@ -197,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
...
@@ -197,9 +185,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
median_value
=
statistics
.
median
(
line_index_list
)
median_value
=
statistics
.
median
(
line_index_list
)
block
[
'index'
]
=
median_value
block
[
'index'
]
=
median_value
# 删除图表block中的虚拟line信息
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
if
block
[
'type'
]
in
[
'table'
,
'image'
]:
if
block
[
'type'
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]:
del
block
[
'lines'
]
block
[
'virtual_lines'
]
=
copy
.
deepcopy
(
block
[
'lines'
])
block
[
'lines'
]
=
copy
.
deepcopy
(
block
[
'real_lines'
])
del
block
[
'real_lines'
]
return
fix_blocks
return
fix_blocks
...
@@ -218,13 +208,12 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
...
@@ -218,13 +208,12 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
):
# 可能是双列结构,可以切细点
):
# 可能是双列结构,可以切细点
lines
=
int
(
block_height
/
line_height
)
+
1
lines
=
int
(
block_height
/
line_height
)
+
1
else
:
else
:
# 如果block的宽度超过0.4页面宽度,则将block分成3行
# 如果block的宽度超过0.4页面宽度,则将block分成3行
(是一种复杂布局,图不能切的太细)
if
block_weight
>
page_w
*
0.4
:
if
block_weight
>
page_w
*
0.4
:
line_height
=
(
y1
-
y0
)
/
3
line_height
=
(
y1
-
y0
)
/
3
lines
=
3
lines
=
3
elif
block_weight
>
page_w
*
0.25
:
# 否则将block分成两行
elif
block_weight
>
page_w
*
0.25
:
# (可能是三列结构,也切细点)
line_height
=
(
y1
-
y0
)
/
2
lines
=
int
(
block_height
/
line_height
)
+
1
lines
=
2
else
:
# 判断长宽比
else
:
# 判断长宽比
if
block_height
/
block_weight
>
1.2
:
# 细长的不分
if
block_height
/
block_weight
>
1.2
:
# 细长的不分
return
[[
x0
,
y0
,
x1
,
y1
]]
return
[[
x0
,
y0
,
x1
,
y1
]]
...
@@ -250,7 +239,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
...
@@ -250,7 +239,11 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
def
sort_lines_by_model
(
fix_blocks
,
page_w
,
page_h
,
line_height
):
page_line_list
=
[]
page_line_list
=
[]
for
block
in
fix_blocks
:
for
block
in
fix_blocks
:
if
block
[
'type'
]
in
[
'text'
,
'title'
,
'interline_equation'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
InterlineEquation
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
if
len
(
block
[
'lines'
])
==
0
:
if
len
(
block
[
'lines'
])
==
0
:
bbox
=
block
[
'bbox'
]
bbox
=
block
[
'bbox'
]
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
...
@@ -261,8 +254,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
...
@@ -261,8 +254,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
bbox
=
line
[
'bbox'
]
bbox
=
line
[
'bbox'
]
page_line_list
.
append
(
bbox
)
page_line_list
.
append
(
bbox
)
elif
block
[
'type'
]
in
[
'table'
,
'image'
]:
elif
block
[
'type'
]
in
[
BlockType
.
ImageBody
,
BlockType
.
TableBody
]:
bbox
=
block
[
'bbox'
]
bbox
=
block
[
'bbox'
]
block
[
"real_lines"
]
=
copy
.
deepcopy
(
block
[
'lines'
])
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
lines
=
insert_lines_into_block
(
bbox
,
line_height
,
page_w
,
page_h
)
block
[
'lines'
]
=
[]
block
[
'lines'
]
=
[]
for
line
in
lines
:
for
line
in
lines
:
...
@@ -316,7 +310,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
...
@@ -316,7 +310,11 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
def
get_line_height
(
blocks
):
def
get_line_height
(
blocks
):
page_line_height_list
=
[]
page_line_height_list
=
[]
for
block
in
blocks
:
for
block
in
blocks
:
if
block
[
'type'
]
in
[
'text'
,
'title'
,
'interline_equation'
]:
if
block
[
'type'
]
in
[
BlockType
.
Text
,
BlockType
.
Title
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
for
line
in
block
[
'lines'
]:
for
line
in
block
[
'lines'
]:
bbox
=
line
[
'bbox'
]
bbox
=
line
[
'bbox'
]
page_line_height_list
.
append
(
int
(
bbox
[
3
]
-
bbox
[
1
]))
page_line_height_list
.
append
(
int
(
bbox
[
3
]
-
bbox
[
1
]))
...
@@ -326,6 +324,63 @@ def get_line_height(blocks):
...
@@ -326,6 +324,63 @@ def get_line_height(blocks):
return
10
return
10
def process_groups(groups, body_key, caption_key, footnote_key):
    """Flatten image/table groups into three lists of blocks tagged by group.

    Each block pulled out of ``groups[i]`` has ``block['group_id'] = i``
    written onto it in place, so the flat lists can later be regrouped by
    that id.

    Args:
        groups: Sequence of group dicts. Each must contain ``body_key``;
            ``caption_key`` and ``footnote_key`` may be absent or ``None``,
            which is treated as "no captions" / "no footnotes".
        body_key: Key under which a group stores its single body block.
        caption_key: Key under which a group stores its list of caption blocks.
        footnote_key: Key under which a group stores its list of footnote
            blocks.

    Returns:
        A tuple ``(body_blocks, caption_blocks, footnote_blocks)``. The blocks
        are the same dict objects held by ``groups`` (not copies), in group
        order.

    Raises:
        KeyError: If a group has no ``body_key`` entry.
    """
    body_blocks = []
    caption_blocks = []
    footnote_blocks = []
    for group_id, group in enumerate(groups):
        body_block = group[body_key]
        body_block['group_id'] = group_id
        body_blocks.append(body_block)

        # ``or ()`` covers both a missing key and an explicit None value.
        for caption_block in group.get(caption_key) or ():
            caption_block['group_id'] = group_id
            caption_blocks.append(caption_block)

        for footnote_block in group.get(footnote_key) or ():
            footnote_block['group_id'] = group_id
            footnote_blocks.append(footnote_block)

    return body_blocks, caption_blocks, footnote_blocks
def process_block_list(blocks, body_type, block_type):
    """Wrap the member blocks of one group into a single parent block.

    Args:
        blocks: Non-empty list of member block dicts; each must have an
            ``'index'`` entry, and members are matched on their ``'type'``.
        body_type: ``'type'`` value identifying the group's body block.
        block_type: ``'type'`` value to give the resulting parent block.

    Returns:
        A dict with keys ``'type'`` (``block_type``), ``'bbox'`` (the bbox of
        the first member whose type equals ``body_type``, or ``[]`` when no
        member matches), ``'blocks'`` (the ``blocks`` list itself, not a copy)
        and ``'index'`` (the median of the members' indices).

    Raises:
        statistics.StatisticsError: If ``blocks`` is empty.
    """
    median_index = statistics.median([block['index'] for block in blocks])

    # The parent takes the body's bbox; captions and footnotes do not widen it.
    body_bbox = []
    for block in blocks:
        if block.get('type') == body_type:
            body_bbox = block['bbox']
            break

    return {
        'type': block_type,
        'bbox': body_bbox,
        'blocks': blocks,
        'index': median_index,
    }
def revert_group_blocks(blocks):
    """Fold flat image/table member blocks back into one parent block per group.

    Blocks typed as image body/caption/footnote are collected by their
    ``'group_id'`` and each collection is passed to ``process_block_list`` to
    build a ``BlockType.Image`` parent; table body/caption/footnote blocks are
    handled the same way and become ``BlockType.Table`` parents. Blocks of any
    other type are passed through unchanged.

    Args:
        blocks: List of block dicts. Every image/table member block must carry
            a ``'group_id'`` entry.

    Returns:
        A new list: the pass-through blocks in their original order, followed
        by the image parents and then the table parents, each in order of first
        appearance of their group id.
    """
    image_member_types = (
        BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote)
    table_member_types = (
        BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote)

    image_groups = {}
    table_groups = {}
    new_blocks = []
    for block in blocks:
        block_type = block['type']
        if block_type in image_member_types:
            image_groups.setdefault(block['group_id'], []).append(block)
        elif block_type in table_member_types:
            table_groups.setdefault(block['group_id'], []).append(block)
        else:
            new_blocks.append(block)

    # Image and table group ids are numbered independently, so the two kinds
    # are kept in separate dicts.
    for members in image_groups.values():
        new_blocks.append(
            process_block_list(members, BlockType.ImageBody, BlockType.Image))
    for members in table_groups.values():
        new_blocks.append(
            process_block_list(members, BlockType.TableBody, BlockType.Table))

    return new_blocks
def
parse_page_core
(
def
parse_page_core
(
page_doc
:
PageableData
,
magic_model
,
page_id
,
pdf_bytes_md5
,
imageWriter
,
parse_mode
page_doc
:
PageableData
,
magic_model
,
page_id
,
pdf_bytes_md5
,
imageWriter
,
parse_mode
):
):
...
@@ -333,8 +388,20 @@ def parse_page_core(
...
@@ -333,8 +388,20 @@ def parse_page_core(
drop_reason
=
[]
drop_reason
=
[]
"""从magic_model对象中获取后面会用到的区块信息"""
"""从magic_model对象中获取后面会用到的区块信息"""
img_blocks
=
magic_model
.
get_imgs
(
page_id
)
# img_blocks = magic_model.get_imgs(page_id)
table_blocks
=
magic_model
.
get_tables
(
page_id
)
# table_blocks = magic_model.get_tables(page_id)
img_groups
=
magic_model
.
get_imgs_v2
(
page_id
)
table_groups
=
magic_model
.
get_tables_v2
(
page_id
)
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
=
process_groups
(
img_groups
,
'image_body'
,
'image_caption_list'
,
'image_footnote_list'
)
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
=
process_groups
(
table_groups
,
'table_body'
,
'table_caption_list'
,
'table_footnote_list'
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
discarded_blocks
=
magic_model
.
get_discarded
(
page_id
)
text_blocks
=
magic_model
.
get_text_blocks
(
page_id
)
text_blocks
=
magic_model
.
get_text_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
title_blocks
=
magic_model
.
get_title_blocks
(
page_id
)
...
@@ -370,8 +437,8 @@ def parse_page_core(
...
@@ -370,8 +437,8 @@ def parse_page_core(
interline_equation_blocks
=
[]
interline_equation_blocks
=
[]
if
len
(
interline_equation_blocks
)
>
0
:
if
len
(
interline_equation_blocks
)
>
0
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_blocks
,
img_
body_blocks
,
img_caption_blocks
,
img_footnote_
blocks
,
table_blocks
,
table_
body_blocks
,
table_caption_blocks
,
table_footnote_
blocks
,
discarded_blocks
,
discarded_blocks
,
text_blocks
,
text_blocks
,
title_blocks
,
title_blocks
,
...
@@ -381,8 +448,8 @@ def parse_page_core(
...
@@ -381,8 +448,8 @@ def parse_page_core(
)
)
else
:
else
:
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
all_bboxes
,
all_discarded_blocks
=
ocr_prepare_bboxes_for_layout_split_v2
(
img_blocks
,
img_
body_blocks
,
img_caption_blocks
,
img_footnote_
blocks
,
table_blocks
,
table_
body_blocks
,
table_caption_blocks
,
table_footnote_
blocks
,
discarded_blocks
,
discarded_blocks
,
text_blocks
,
text_blocks
,
title_blocks
,
title_blocks
,
...
@@ -419,7 +486,7 @@ def parse_page_core(
...
@@ -419,7 +486,7 @@ def parse_page_core(
block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_bboxes
,
spans
,
0.5
)
block_with_spans
,
spans
=
fill_spans_in_blocks
(
all_bboxes
,
spans
,
0.5
)
"""对block进行fix操作"""
"""对block进行fix操作"""
fix_blocks
=
fix_block_spans
(
block_with_spans
,
img_blocks
,
table_blocks
)
fix_blocks
=
fix_block_spans
_v2
(
block_with_spans
)
"""获取所有line并计算正文line的高度"""
"""获取所有line并计算正文line的高度"""
line_height
=
get_line_height
(
fix_blocks
)
line_height
=
get_line_height
(
fix_blocks
)
...
@@ -430,6 +497,9 @@ def parse_page_core(
...
@@ -430,6 +497,9 @@ def parse_page_core(
"""根据line的中位数算block的序列关系"""
"""根据line的中位数算block的序列关系"""
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
fix_blocks
=
cal_block_index
(
fix_blocks
,
sorted_bboxes
)
"""将image和table的block还原回group形式参与后续流程"""
fix_blocks
=
revert_group_blocks
(
fix_blocks
)
"""重排block"""
"""重排block"""
sorted_blocks
=
sorted
(
fix_blocks
,
key
=
lambda
b
:
b
[
'index'
])
sorted_blocks
=
sorted
(
fix_blocks
,
key
=
lambda
b
:
b
[
'index'
])
...
...
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
View file @
6575adea
...
@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
...
@@ -60,29 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
return
all_bboxes
,
all_discarded_blocks
,
drop_reasons
return
all_bboxes
,
all_discarded_blocks
,
drop_reasons
def
ocr_prepare_bboxes_for_layout_split_v2
(
img_blocks
,
table_blocks
,
discarded_blocks
,
text_blocks
,
def
add_bboxes
(
blocks
,
block_type
,
bboxes
):
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
for
block
in
blocks
:
x0
,
y0
,
x1
,
y1
=
block
[
'bbox'
]
if
block_type
in
[
BlockType
.
ImageBody
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableBody
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
block_type
,
None
,
None
,
None
,
None
,
block
[
"score"
],
block
[
"group_id"
]])
else
:
bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
block_type
,
None
,
None
,
None
,
None
,
block
[
"score"
]])
def
ocr_prepare_bboxes_for_layout_split_v2
(
img_body_blocks
,
img_caption_blocks
,
img_footnote_blocks
,
table_body_blocks
,
table_caption_blocks
,
table_footnote_blocks
,
discarded_blocks
,
text_blocks
,
title_blocks
,
interline_equation_blocks
,
page_w
,
page_h
):
all_bboxes
=
[]
all_bboxes
=
[]
all_discarded_blocks
=
[]
for
image
in
img_blocks
:
x0
,
y0
,
x1
,
y1
=
image
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Image
,
None
,
None
,
None
,
None
,
image
[
"score"
]])
for
table
in
table_blocks
:
add_bboxes
(
img_body_blocks
,
BlockType
.
ImageBody
,
all_bboxes
)
x0
,
y0
,
x1
,
y1
=
table
[
'bbox'
]
add_bboxes
(
img_caption_blocks
,
BlockType
.
ImageCaption
,
all_bboxes
)
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Table
,
None
,
None
,
None
,
None
,
table
[
"score"
]])
add_bboxes
(
img_footnote_blocks
,
BlockType
.
ImageFootnote
,
all_bboxes
)
add_bboxes
(
table_body_blocks
,
BlockType
.
TableBody
,
all_bboxes
)
for
text
in
text_blocks
:
add_bboxes
(
table_caption_blocks
,
BlockType
.
TableCaption
,
all_bboxes
)
x0
,
y0
,
x1
,
y1
=
text
[
'bbox'
]
add_bboxes
(
table_footnote_blocks
,
BlockType
.
TableFootnote
,
all_bboxes
)
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Text
,
None
,
None
,
None
,
None
,
text
[
"score"
]])
add_bboxes
(
text_blocks
,
BlockType
.
Text
,
all_bboxes
)
add_bboxes
(
title_blocks
,
BlockType
.
Title
,
all_bboxes
)
for
title
in
title_blocks
:
add_bboxes
(
interline_equation_blocks
,
BlockType
.
InterlineEquation
,
all_bboxes
)
x0
,
y0
,
x1
,
y1
=
title
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Title
,
None
,
None
,
None
,
None
,
title
[
"score"
]])
for
interline_equation
in
interline_equation_blocks
:
x0
,
y0
,
x1
,
y1
=
interline_equation
[
'bbox'
]
all_bboxes
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
InterlineEquation
,
None
,
None
,
None
,
None
,
interline_equation
[
"score"
]])
'''block嵌套问题解决'''
'''block嵌套问题解决'''
'''文本框与标题框重叠,优先信任文本框'''
'''文本框与标题框重叠,优先信任文本框'''
...
@@ -96,12 +101,14 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
...
@@ -96,12 +101,14 @@ def ocr_prepare_bboxes_for_layout_split_v2(img_blocks, table_blocks, discarded_b
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
'''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框'''
# 通过后续大框套小框逻辑删除
# 通过后续大框套小框逻辑删除
'''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)'''
'''discarded_blocks'''
all_discarded_blocks
=
[]
add_bboxes
(
discarded_blocks
,
BlockType
.
Discarded
,
all_discarded_blocks
)
'''footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的'''
footnote_blocks
=
[]
footnote_blocks
=
[]
for
discarded
in
discarded_blocks
:
for
discarded
in
discarded_blocks
:
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
x0
,
y0
,
x1
,
y1
=
discarded
[
'bbox'
]
all_discarded_blocks
.
append
([
x0
,
y0
,
x1
,
y1
,
None
,
None
,
None
,
BlockType
.
Discarded
,
None
,
None
,
None
,
None
,
discarded
[
"score"
]])
# 将footnote加入到all_bboxes中,用来计算layout
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
if
(
x1
-
x0
)
>
(
page_w
/
3
)
and
(
y1
-
y0
)
>
10
and
y0
>
(
page_h
/
2
):
footnote_blocks
.
append
([
x0
,
y0
,
x1
,
y1
])
footnote_blocks
.
append
([
x0
,
y0
,
x1
,
y1
])
...
...
magic_pdf/pre_proc/ocr_dict_merge.py
View file @
6575adea
...
@@ -49,7 +49,7 @@ def merge_spans_to_line(spans):
...
@@ -49,7 +49,7 @@ def merge_spans_to_line(spans):
continue
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
0.
6
):
if
__is_overlaps_y_exceeds_threshold
(
span
[
'bbox'
],
current_line
[
-
1
][
'bbox'
],
0.
5
):
current_line
.
append
(
span
)
current_line
.
append
(
span
)
else
:
else
:
# 否则,开始新行
# 否则,开始新行
...
@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
...
@@ -153,6 +153,11 @@ def fill_spans_in_blocks(blocks, spans, radio):
'type'
:
block_type
,
'type'
:
block_type
,
'bbox'
:
block_bbox
,
'bbox'
:
block_bbox
,
}
}
if
block_type
in
[
BlockType
.
ImageBody
,
BlockType
.
ImageCaption
,
BlockType
.
ImageFootnote
,
BlockType
.
TableBody
,
BlockType
.
TableCaption
,
BlockType
.
TableFootnote
]:
block_dict
[
"group_id"
]
=
block
[
-
1
]
block_spans
=
[]
block_spans
=
[]
for
span
in
spans
:
for
span
in
spans
:
span_bbox
=
span
[
'bbox'
]
span_bbox
=
span
[
'bbox'
]
...
@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
...
@@ -201,6 +206,27 @@ def fix_block_spans(block_with_spans, img_blocks, table_blocks):
return
fix_blocks
return
fix_blocks
def fix_block_spans_v2(block_with_spans):
    """Run every block through the fixer that matches its type.

    Text-like blocks (text, title, image/table captions and footnotes) are
    passed to ``fix_text_block``; interline equations and image/table bodies
    are passed to ``fix_interline_block``. Blocks of any other type are left
    out of the result.

    NOTE(review): the original note on this function says that caption and
    footnote text spans must end up inside their own caption/footnote blocks
    and that the ``spans`` field must be removed from each block. Presumably
    the two fixers do this; confirm against their implementations.

    Args:
        block_with_spans: Iterable of block dicts, each with a ``'type'`` entry.

    Returns:
        List of the fixer results, in input order, for the recognised blocks.
    """
    text_like_types = (
        BlockType.Text, BlockType.Title,
        BlockType.ImageCaption, BlockType.ImageFootnote,
        BlockType.TableCaption, BlockType.TableFootnote,
    )
    interline_like_types = (
        BlockType.InterlineEquation, BlockType.ImageBody, BlockType.TableBody,
    )

    fix_blocks = []
    for block in block_with_spans:
        block_type = block['type']
        if block_type in text_like_types:
            fix_blocks.append(fix_text_block(block))
        elif block_type in interline_like_types:
            fix_blocks.append(fix_interline_block(block))
        # Any other block type is intentionally skipped.
    return fix_blocks
def
fix_discarded_block
(
discarded_block_with_spans
):
def
fix_discarded_block
(
discarded_block_with_spans
):
fix_discarded_blocks
=
[]
fix_discarded_blocks
=
[]
for
block
in
discarded_block_with_spans
:
for
block
in
discarded_block_with_spans
:
...
...
magic_pdf/tools/cli.py
View file @
6575adea
...
@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""",
...
@@ -52,7 +52,7 @@ without method specified, auto will be used by default.""",
help
=
"""
help
=
"""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
You should input "Abbreviation" with language form url:
You should input "Abbreviation" with language form url:
https://paddlepaddle.github.io/PaddleOCR/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
https://paddlepaddle.github.io/PaddleOCR/
latest/
en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
"""
,
"""
,
default
=
None
,
default
=
None
,
)
)
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment