wangsen / MinerU · Commits · 0c7a0882

Commit 0c7a0882 (Unverified)
Authored Jun 12, 2025 by Xiaomeng Zhao; committed by GitHub on Jun 12, 2025

Merge pull request #2611 from myhloli/dev

Dev

Parents: 3bd0ecf1, a392f445
Changes: 262
Showing 20 changed files with 0 additions and 1822 deletions (+0 −1822):

magic_pdf/integrations/rag/type.py                +0 −82
magic_pdf/integrations/rag/utils.py               +0 −284
magic_pdf/libs/__init__.py                        +0 −0
magic_pdf/libs/clean_memory.py                    +0 −17
magic_pdf/libs/commons.py                         +0 −43
magic_pdf/libs/convert_utils.py                   +0 −5
magic_pdf/libs/coordinate_transform.py            +0 −9
magic_pdf/libs/draw_bbox.py                       +0 −418
magic_pdf/libs/json_compressor.py                 +0 −27
magic_pdf/libs/local_math.py                      +0 −9
magic_pdf/libs/markdown_utils.py                  +0 −10
magic_pdf/libs/pdf_check.py                       +0 −99
magic_pdf/libs/pdf_image_tools.py                 +0 −63
magic_pdf/libs/performance_stats.py               +0 −65
magic_pdf/libs/safe_filename.py                   +0 −11
magic_pdf/libs/version.py                         +0 −1
magic_pdf/model/__init__.py                       +0 −2
magic_pdf/model/doc_analyze_by_custom_model.py    +0 −301
magic_pdf/model/pdf_extract_kit.py                +0 −266
magic_pdf/model/pp_structure_v2.py                +0 −110
magic_pdf/integrations/rag/type.py (deleted, 100644 → 0)

```python
from enum import Enum

from pydantic import BaseModel, Field


# rag
class CategoryType(Enum):  # py310 not support StrEnum
    text = 'text'
    title = 'title'
    interline_equation = 'interline_equation'
    image = 'image'
    image_body = 'image_body'
    image_caption = 'image_caption'
    table = 'table'
    table_body = 'table_body'
    table_caption = 'table_caption'
    table_footnote = 'table_footnote'


class ElementRelType(Enum):
    sibling = 'sibling'


class PageInfo(BaseModel):
    page_no: int = Field(description='the index of page, start from zero', ge=0)
    height: int = Field(description='the height of page', gt=0)
    width: int = Field(description='the width of page', ge=0)
    image_path: str | None = Field(description='the image of this page', default=None)


class ContentObject(BaseModel):
    category_type: CategoryType = Field(description='category')
    poly: list[float] = Field(
        description=('Coordinates, need to convert back to PDF coordinates,'
                     ' order is top-left, top-right, bottom-right, bottom-left'
                     ' x,y coordinates'))
    ignore: bool = Field(description='whether ignore this object', default=False)
    text: str | None = Field(description='text content of the object', default=None)
    image_path: str | None = Field(description='path of embedded image', default=None)
    order: int = Field(description='the order of this object within a page', default=-1)
    anno_id: int = Field(description='unique id', default=-1)
    latex: str | None = Field(description='latex result', default=None)
    html: str | None = Field(description='html result', default=None)


class ElementRelation(BaseModel):
    source_anno_id: int = Field(description='unique id of the source object', default=-1)
    target_anno_id: int = Field(description='unique id of the target object', default=-1)
    relation: ElementRelType = Field(description='the relation between source and target element')


class LayoutElementsExtra(BaseModel):
    element_relation: list[ElementRelation] = Field(description='the relation between source and target element')


class LayoutElements(BaseModel):
    layout_dets: list[ContentObject] = Field(description='layout element details')
    page_info: PageInfo = Field(description='page info')
    extra: LayoutElementsExtra = Field(description='extra information')


# iter data format
class Node(BaseModel):
    category_type: CategoryType = Field(description='category')
    text: str | None = Field(description='text content of the object', default=None)
    image_path: str | None = Field(description='path of embedded image', default=None)
    anno_id: int = Field(description='unique id', default=-1)
    latex: str | None = Field(description='latex result', default=None)
    html: str | None = Field(description='html result', default=None)
```
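Note: a minimal sketch (not part of the diff, values invented for illustration) of how the deleted pydantic models compose into a one-element page, assuming the classes above are importable:

```python
# Hypothetical values; in practice these objects are produced by
# convert_middle_json_to_layout_elements() in utils.py below.
page = PageInfo(page_no=0, height=842, width=595)
para = ContentObject(
    category_type=CategoryType.text,
    poly=[50, 80, 545, 80, 545, 120, 50, 120],  # TL, TR, BR, BL x,y pairs
    text='Example paragraph',
    order=0,
    anno_id=0,
)
elements = LayoutElements(
    layout_dets=[para],
    page_info=page,
    extra=LayoutElementsExtra(element_relation=[]),
)
print(elements.page_info)
```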
magic_pdf/integrations/rag/utils.py (deleted, 100644 → 0)

```python
import json
import os
from pathlib import Path

from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from magic_pdf.integrations.rag.type import (CategoryType, ContentObject,
                                             ElementRelation, ElementRelType,
                                             LayoutElements,
                                             LayoutElementsExtra, PageInfo)
from magic_pdf.tools.common import do_parse, prepare_env


def convert_middle_json_to_layout_elements(
    json_data: dict,
    output_dir: str,
) -> list[LayoutElements]:
    uniq_anno_id = 0

    res: list[LayoutElements] = []
    for page_no, page_data in enumerate(json_data['pdf_info']):
        order_id = 0
        page_info = PageInfo(
            height=int(page_data['page_size'][1]),
            width=int(page_data['page_size'][0]),
            page_no=page_no,
        )
        layout_dets: list[ContentObject] = []
        extra_element_relation: list[ElementRelation] = []

        for para_block in page_data['para_blocks']:
            para_text = ''
            para_type = para_block['type']

            if para_type == BlockType.Text:
                para_text = merge_para_with_text(para_block)
                x0, y0, x1, y1 = para_block['bbox']
                content = ContentObject(
                    anno_id=uniq_anno_id,
                    category_type=CategoryType.text,
                    text=para_text,
                    order=order_id,
                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                )
                uniq_anno_id += 1
                order_id += 1
                layout_dets.append(content)

            elif para_type == BlockType.Title:
                para_text = merge_para_with_text(para_block)
                x0, y0, x1, y1 = para_block['bbox']
                content = ContentObject(
                    anno_id=uniq_anno_id,
                    category_type=CategoryType.title,
                    text=para_text,
                    order=order_id,
                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                )
                uniq_anno_id += 1
                order_id += 1
                layout_dets.append(content)

            elif para_type == BlockType.InterlineEquation:
                para_text = merge_para_with_text(para_block)
                x0, y0, x1, y1 = para_block['bbox']
                content = ContentObject(
                    anno_id=uniq_anno_id,
                    category_type=CategoryType.interline_equation,
                    text=para_text,
                    order=order_id,
                    poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                )
                uniq_anno_id += 1
                order_id += 1
                layout_dets.append(content)

            elif para_type == BlockType.Image:
                body_anno_id = -1
                caption_anno_id = -1

                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Image:
                                    x0, y0, x1, y1 = block['bbox']
                                    content = ContentObject(
                                        anno_id=uniq_anno_id,
                                        category_type=CategoryType.image_body,
                                        image_path=os.path.join(output_dir, span['image_path']),
                                        order=order_id,
                                        poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                                    )
                                    body_anno_id = uniq_anno_id
                                    uniq_anno_id += 1
                                    order_id += 1
                                    layout_dets.append(content)

                for block in para_block['blocks']:
                    if block['type'] == BlockType.ImageCaption:
                        para_text += merge_para_with_text(block)
                        x0, y0, x1, y1 = block['bbox']
                        content = ContentObject(
                            anno_id=uniq_anno_id,
                            category_type=CategoryType.image_caption,
                            text=para_text,
                            order=order_id,
                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                        )
                        caption_anno_id = uniq_anno_id
                        uniq_anno_id += 1
                        order_id += 1
                        layout_dets.append(content)

                if body_anno_id > 0 and caption_anno_id > 0:
                    element_relation = ElementRelation(
                        relation=ElementRelType.sibling,
                        source_anno_id=body_anno_id,
                        target_anno_id=caption_anno_id,
                    )
                    extra_element_relation.append(element_relation)

            elif para_type == BlockType.Table:
                body_anno_id, caption_anno_id, footnote_anno_id = -1, -1, -1

                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableCaption:
                        para_text += merge_para_with_text(block)
                        x0, y0, x1, y1 = block['bbox']
                        content = ContentObject(
                            anno_id=uniq_anno_id,
                            category_type=CategoryType.table_caption,
                            text=para_text,
                            order=order_id,
                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                        )
                        caption_anno_id = uniq_anno_id
                        uniq_anno_id += 1
                        order_id += 1
                        layout_dets.append(content)

                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableBody:
                        for line in block['lines']:
                            for span in line['spans']:
                                if span['type'] == ContentType.Table:
                                    x0, y0, x1, y1 = para_block['bbox']
                                    content = ContentObject(
                                        anno_id=uniq_anno_id,
                                        category_type=CategoryType.table_body,
                                        order=order_id,
                                        poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                                    )
                                    body_anno_id = uniq_anno_id
                                    uniq_anno_id += 1
                                    order_id += 1

                                    # if processed by table model
                                    if span.get('latex', ''):
                                        content.latex = span['latex']
                                    else:
                                        content.image_path = os.path.join(output_dir, span['image_path'])

                                    layout_dets.append(content)

                for block in para_block['blocks']:
                    if block['type'] == BlockType.TableFootnote:
                        para_text += merge_para_with_text(block)
                        x0, y0, x1, y1 = block['bbox']
                        content = ContentObject(
                            anno_id=uniq_anno_id,
                            category_type=CategoryType.table_footnote,
                            text=para_text,
                            order=order_id,
                            poly=[x0, y0, x1, y0, x1, y1, x0, y1],
                        )
                        footnote_anno_id = uniq_anno_id
                        uniq_anno_id += 1
                        order_id += 1
                        layout_dets.append(content)

                if caption_anno_id != -1 and body_anno_id != -1:
                    element_relation = ElementRelation(
                        relation=ElementRelType.sibling,
                        source_anno_id=body_anno_id,
                        target_anno_id=caption_anno_id,
                    )
                    extra_element_relation.append(element_relation)

                if footnote_anno_id != -1 and body_anno_id != -1:
                    element_relation = ElementRelation(
                        relation=ElementRelType.sibling,
                        source_anno_id=body_anno_id,
                        target_anno_id=footnote_anno_id,
                    )
                    extra_element_relation.append(element_relation)

        res.append(
            LayoutElements(
                page_info=page_info,
                layout_dets=layout_dets,
                extra=LayoutElementsExtra(element_relation=extra_element_relation),
            ))

    return res


def inference(path, output_dir, method):
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    if output_dir == '':
        if os.path.isdir(path):
            output_dir = os.path.join(path, 'output')
        else:
            output_dir = os.path.join(os.path.dirname(path), 'output')

    local_image_dir, local_md_dir = prepare_env(output_dir,
                                                str(Path(path).stem), method)

    def read_fn(path):
        disk_rw = FileBasedDataReader(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path))

    def parse_doc(doc_path: str):
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                False,
                f_draw_span_bbox=False,
                f_draw_layout_bbox=False,
                f_dump_md=False,
                f_dump_middle_json=True,
                f_dump_model_json=False,
                f_dump_orig_pdf=False,
                f_dump_content_list=False,
                f_draw_model_bbox=False,
            )
            middle_json_fn = os.path.join(local_md_dir,
                                          f'{file_name}_middle.json')
            with open(middle_json_fn) as fd:
                jso = json.load(fd)
            os.remove(middle_json_fn)
            return convert_middle_json_to_layout_elements(jso, local_image_dir)
        except Exception as e:
            logger.exception(e)

    return parse_doc(path)


if __name__ == '__main__':
    import pprint

    base_dir = '/opt/data/pdf/resources/samples/'
    if 0:
        with open(base_dir + 'json_outputs/middle.json') as f:
            d = json.load(f)
        result = convert_middle_json_to_layout_elements(d, '/tmp')
        pprint.pp(result)
    if 0:
        with open(base_dir + 'json_outputs/middle.3.json') as f:
            d = json.load(f)
        result = convert_middle_json_to_layout_elements(d, '/tmp')
        pprint.pp(result)
    if 1:
        res = inference(
            base_dir + 'samples/pdf/one_page_with_table_image.pdf',
            '/tmp/output',
            'ocr',
        )
        pprint.pp(res)
```
magic_pdf/libs/__init__.py (deleted, 100644 → 0; the file was empty)
magic_pdf/libs/clean_memory.py (deleted, 100644 → 0)

```python
# Copyright (c) Opendatalab. All rights reserved.
import torch
import gc


def clean_memory(device='cuda'):
    if device == 'cuda':
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    elif str(device).startswith("npu"):
        import torch_npu
        if torch_npu.npu.is_available():
            torch_npu.npu.empty_cache()
    elif str(device).startswith("mps"):
        torch.mps.empty_cache()
    gc.collect()
```
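Note: a sketch of a typical call site (not from the diff); the helper is meant to be called between heavy inference steps to release cached accelerator memory:

```python
import torch

# Pick whichever device string the rest of the pipeline uses.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# ... run a model inference step ...
clean_memory(device)  # empties the CUDA/NPU/MPS cache and runs gc.collect()
```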
magic_pdf/libs/commons.py (deleted, 100644 → 0)

```python
def join_path(*args):
    return '/'.join(str(s).rstrip('/') for s in args)


def get_top_percent_list(num_list, percent):
    """Get the top `percent` fraction of elements from the list.
    :param num_list:
    :param percent:
    :return:
    """
    if len(num_list) == 0:
        top_percent_list = []
    else:
        # sort imgs_len_list in descending order
        sorted_imgs_len_list = sorted(num_list, reverse=True)
        # compute the cut-off index for `percent`
        top_percent_index = int(len(sorted_imgs_len_list) * percent)
        # take the top fraction of elements
        top_percent_list = sorted_imgs_len_list[:top_percent_index]
    return top_percent_list


def mymax(alist: list):
    if len(alist) == 0:
        return 0  # an empty list counts as 0; 0*0 is also size 0
    else:
        return max(alist)


def parse_bucket_key(s3_full_path: str):
    """
    Input:  s3://bucket/path/to/my/file.txt
    Output: bucket, path/to/my/file.txt
    """
    s3_full_path = s3_full_path.strip()
    if s3_full_path.startswith("s3://"):
        s3_full_path = s3_full_path[5:]
    if s3_full_path.startswith("/"):
        s3_full_path = s3_full_path[1:]
    bucket, key = s3_full_path.split("/", 1)
    return bucket, key
```
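Note: the behavior of these small helpers is easiest to see from a few example calls (values are illustrative, not from the diff):

```python
# join_path normalizes trailing slashes before joining with '/'
assert join_path('s3://bucket/', 'images/', 'page_0.jpg') == 's3://bucket/images/page_0.jpg'

# parse_bucket_key splits an s3 URI into bucket and key
assert parse_bucket_key('s3://my-bucket/path/to/file.txt') == ('my-bucket', 'path/to/file.txt')

# get_top_percent_list returns the largest `percent` fraction of the values
assert get_top_percent_list([5, 1, 9, 3, 7], 0.4) == [9, 7]

# mymax treats an empty list as 0
assert mymax([]) == 0
```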
magic_pdf/libs/convert_utils.py (deleted, 100644 → 0)

```python
def dict_to_list(input_dict):
    items_list = []
    for _, item in input_dict.items():
        items_list.append(item)
    return items_list
```
magic_pdf/libs/coordinate_transform.py (deleted, 100644 → 0)

```python
def get_scale_ratio(model_page_info, page):
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)
    width_from_json = model_page_info['page_info']['width']
    height_from_json = model_page_info['page_info']['height']
    horizontal_scale_ratio = width_from_json / pymu_width
    vertical_scale_ratio = height_from_json / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
```
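Note: for context, a sketch (values are hypothetical, not from the diff) of how the two ratios would be applied; dividing model-space coordinates by the ratios maps them back to 72-dpi PyMuPDF page coordinates:

```python
# Suppose the layout model ran on a 2x render of the page.
horizontal_scale_ratio, vertical_scale_ratio = 2.0, 2.0
model_bbox = [100, 200, 300, 400]  # x0, y0, x1, y1 in model-image space
page_bbox = [
    model_bbox[0] / horizontal_scale_ratio,
    model_bbox[1] / vertical_scale_ratio,
    model_bbox[2] / horizontal_scale_ratio,
    model_bbox[3] / vertical_scale_ratio,
]
print(page_bbox)  # [50.0, 100.0, 150.0, 200.0]
```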
magic_pdf/libs/draw_bbox.py (deleted, 100644 → 0)

```python
import fitz

from magic_pdf.config.constants import CROSS_PAGE
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
                                               ContentType)
from magic_pdf.data.dataset import Dataset
from magic_pdf.model.magic_model import MagicModel


def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255
        new_rgb.append(item)
    page_data = bbox_list[i]
    for bbox in page_data:
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle


def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255
        new_rgb.append(item)
    page_data = bbox_list[i]
    for j, bbox in enumerate(page_data):
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if draw_bbox:
            if fill_config:
                page.draw_rect(
                    rect_coords,
                    color=None,
                    fill=new_rgb,
                    fill_opacity=0.3,
                    width=0.5,
                    overlay=True,
                )  # Draw the rectangle
            else:
                page.draw_rect(
                    rect_coords,
                    color=new_rgb,
                    fill=None,
                    fill_opacity=1,
                    width=0.5,
                    overlay=True,
                )  # Draw the rectangle
        page.insert_text(
            (x1 + 2, y0 + 10),
            str(j + 1),
            fontsize=10,
            color=new_rgb,
        )  # Insert the index in the top left corner of the rectangle


def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    imgs_footnote_list = []
    titles_list = []
    texts_list = []
    interequations_list = []
    lists_list = []
    indexs_list = []
    for page in pdf_info:
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
        titles = []
        texts = []
        interequations = []
        lists = []
        indices = []
        for dropped_bbox in page['discarded_blocks']:
            page_dropped_list.append(dropped_bbox['bbox'])
        dropped_bbox_list.append(page_dropped_list)
        for block in page['para_blocks']:
            bbox = block['bbox']
            if block['type'] == BlockType.Table:
                tables.append(bbox)
                for nested_block in block['blocks']:
                    bbox = nested_block['bbox']
                    if nested_block['type'] == BlockType.TableBody:
                        tables_body.append(bbox)
                    elif nested_block['type'] == BlockType.TableCaption:
                        tables_caption.append(bbox)
                    elif nested_block['type'] == BlockType.TableFootnote:
                        tables_footnote.append(bbox)
            elif block['type'] == BlockType.Image:
                imgs.append(bbox)
                for nested_block in block['blocks']:
                    bbox = nested_block['bbox']
                    if nested_block['type'] == BlockType.ImageBody:
                        imgs_body.append(bbox)
                    elif nested_block['type'] == BlockType.ImageCaption:
                        imgs_caption.append(bbox)
                    elif nested_block['type'] == BlockType.ImageFootnote:
                        imgs_footnote.append(bbox)
            elif block['type'] == BlockType.Title:
                titles.append(bbox)
            elif block['type'] == BlockType.Text:
                texts.append(bbox)
            elif block['type'] == BlockType.InterlineEquation:
                interequations.append(bbox)
            elif block['type'] == BlockType.List:
                lists.append(bbox)
            elif block['type'] == BlockType.Index:
                indices.append(bbox)

        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
        lists_list.append(lists)
        indexs_list.append(indices)

    layout_bbox_list = []

    table_type_order = {
        'table_caption': 1,
        'table_body': 2,
        'table_footnote': 3
    }
    for page in pdf_info:
        page_block_list = []
        for block in page['para_blocks']:
            if block['type'] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
                BlockType.List,
                BlockType.Index,
            ]:
                bbox = block['bbox']
                page_block_list.append(bbox)
            elif block['type'] in [BlockType.Image]:
                for sub_block in block['blocks']:
                    bbox = sub_block['bbox']
                    page_block_list.append(bbox)
            elif block['type'] in [BlockType.Table]:
                sorted_blocks = sorted(block['blocks'], key=lambda x: table_type_order[x['type']])
                for sub_block in sorted_blocks:
                    bbox = sub_block['bbox']
                    page_block_list.append(bbox)
        layout_bbox_list.append(page_block_list)

    pdf_docs = fitz.open('pdf', pdf_bytes)

    for i, page in enumerate(pdf_docs):
        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
        # draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
        # draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
        draw_bbox_without_number(i, imgs_footnote_list, page, [255, 178, 102], True)
        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)
        draw_bbox_without_number(i, lists_list, page, [40, 169, 92], True)
        draw_bbox_without_number(i, indexs_list, page, [40, 169, 92], True)
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False)

    # Save the PDF
    pdf_docs.save(f'{out_path}/{filename}')


def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        if span['type'] == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span['bbox'])
            else:
                page_text_list.append(span['bbox'])
        elif span['type'] == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span['bbox'])
            else:
                page_inline_equation_list.append(span['bbox'])
        elif span['type'] == ContentType.InterlineEquation:
            page_interline_equation_list.append(span['bbox'])
        elif span['type'] == ContentType.Image:
            page_image_list.append(span['bbox'])
        elif span['type'] == ContentType.Table:
            page_table_list.append(span['bbox'])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []

        # move cross-page spans into the next page's lists
        if len(next_page_text_list) > 0:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if len(next_page_inline_equation_list) > 0:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()

        # build dropped_list
        for block in page['discarded_blocks']:
            if block['type'] == BlockType.Discarded:
                for line in block['lines']:
                    for span in line['spans']:
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)

        # build the remaining useful lists
        # for block in page['para_blocks']:  # spans can simply use the pre-merge result
        for block in page['preproc_blocks']:
            if block['type'] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
                BlockType.List,
                BlockType.Index,
            ]:
                for line in block['lines']:
                    for span in line['spans']:
                        get_span_info(span)
            elif block['type'] in [BlockType.Image, BlockType.Table]:
                for sub_block in block['blocks']:
                    for line in sub_block['lines']:
                        for span in line['spans']:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)

    pdf_docs = fitz.open('pdf', pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # draw the data for the current page
        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)

    # Save the PDF
    pdf_docs.save(f'{out_path}/{filename}')


def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []
    magic_model = MagicModel(model_list, dataset)
    for i in range(len(model_list)):
        page_dropped_list = []
        tables_body, tables_caption, tables_footnote = [], [], []
        imgs_body, imgs_caption, imgs_footnote = [], [], []
        titles = []
        texts = []
        interequations = []
        page_info = magic_model.get_model_list(i)
        layout_dets = page_info['layout_dets']
        for layout_det in layout_dets:
            bbox = layout_det['bbox']
            if layout_det['category_id'] == CategoryId.Text:
                texts.append(bbox)
            elif layout_det['category_id'] == CategoryId.Title:
                titles.append(bbox)
            elif layout_det['category_id'] == CategoryId.TableBody:
                tables_body.append(bbox)
            elif layout_det['category_id'] == CategoryId.TableCaption:
                tables_caption.append(bbox)
            elif layout_det['category_id'] == CategoryId.TableFootnote:
                tables_footnote.append(bbox)
            elif layout_det['category_id'] == CategoryId.ImageBody:
                imgs_body.append(bbox)
            elif layout_det['category_id'] == CategoryId.ImageCaption:
                imgs_caption.append(bbox)
            elif layout_det['category_id'] == CategoryId.InterlineEquation_YOLO:
                interequations.append(bbox)
            elif layout_det['category_id'] == CategoryId.Abandon:
                page_dropped_list.append(bbox)
            elif layout_det['category_id'] == CategoryId.ImageFootnote:
                imgs_footnote.append(bbox)

        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
        dropped_bbox_list.append(page_dropped_list)
        imgs_footnote_list.append(imgs_footnote)

    for i in range(len(dataset)):
        page = dataset.get_page(i)
        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], True)  # color !
        draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
        draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102], True)
        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)

    # Save the PDF
    dataset.dump_to_file(f'{out_path}/{filename}')


def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    layout_bbox_list = []

    for page in pdf_info:
        page_line_list = []
        for block in page['preproc_blocks']:
            if block['type'] in [BlockType.Text]:
                for line in block['lines']:
                    bbox = line['bbox']
                    index = line['index']
                    page_line_list.append({'index': index, 'bbox': bbox})
            elif block['type'] in [BlockType.Title, BlockType.InterlineEquation]:
                if 'virtual_lines' in block:
                    if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
                        for line in block['virtual_lines']:
                            bbox = line['bbox']
                            index = line['index']
                            page_line_list.append({'index': index, 'bbox': bbox})
                else:
                    for line in block['lines']:
                        bbox = line['bbox']
                        index = line['index']
                        page_line_list.append({'index': index, 'bbox': bbox})
            elif block['type'] in [BlockType.Image, BlockType.Table]:
                for sub_block in block['blocks']:
                    if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
                        if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
                            for line in sub_block['virtual_lines']:
                                bbox = line['bbox']
                                index = line['index']
                                page_line_list.append({'index': index, 'bbox': bbox})
                        else:
                            for line in sub_block['lines']:
                                bbox = line['bbox']
                                index = line['index']
                                page_line_list.append({'index': index, 'bbox': bbox})
                    elif sub_block['type'] in [BlockType.ImageCaption, BlockType.TableCaption,
                                               BlockType.ImageFootnote, BlockType.TableFootnote]:
                        for line in sub_block['lines']:
                            bbox = line['bbox']
                            index = line['index']
                            page_line_list.append({'index': index, 'bbox': bbox})
        sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
        layout_bbox_list.append([sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes])

    pdf_docs = fitz.open('pdf', pdf_bytes)
    for i, page in enumerate(pdf_docs):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)

    pdf_docs.save(f'{out_path}/{filename}')


def draw_char_bbox(pdf_bytes, out_path, filename):
    pdf_docs = fitz.open('pdf', pdf_bytes)
    for i, page in enumerate(pdf_docs):
        for block in page.get_text('rawdict',
                                   flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']:
            for line in block['lines']:
                for span in line['spans']:
                    for char in span['chars']:
                        char_bbox = char['bbox']
                        page.draw_rect(
                            char_bbox,
                            color=[1, 0, 0],
                            fill=None,
                            fill_opacity=1,
                            width=0.3,
                            overlay=True,
                        )
    pdf_docs.save(f'{out_path}/{filename}')
```
magic_pdf/libs/json_compressor.py (deleted, 100644 → 0)

```python
import json
import brotli
import base64


class JsonCompressor:

    @staticmethod
    def compress_json(data):
        """Compress a json object and encode it with base64."""
        json_str = json.dumps(data)
        json_bytes = json_str.encode('utf-8')
        compressed = brotli.compress(json_bytes, quality=6)
        compressed_str = base64.b64encode(compressed).decode('utf-8')  # convert bytes to string
        return compressed_str

    @staticmethod
    def decompress_json(compressed_str):
        """Decode the base64 string and decompress the json object."""
        compressed = base64.b64decode(compressed_str.encode('utf-8'))  # convert string to bytes
        decompressed_bytes = brotli.decompress(compressed)
        json_str = decompressed_bytes.decode('utf-8')
        data = json.loads(json_str)
        return data
```
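Note: a round-trip example of the compressor (the payload is illustrative):

```python
payload = {'pdf_info': [{'page_no': 0, 'para_blocks': []}]}

encoded = JsonCompressor.compress_json(payload)   # base64 string, safe to store in text fields
decoded = JsonCompressor.decompress_json(encoded)
assert decoded == payload
```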
magic_pdf/libs/local_math.py (deleted, 100644 → 0)

```python
def float_gt(a, b):
    if 0.0001 >= abs(a - b):
        return False
    return a > b


def float_equal(a, b):
    if 0.0001 >= abs(a - b):
        return True
    return False
```
magic_pdf/libs/markdown_utils.py (deleted, 100644 → 0)

```python
def ocr_escape_special_markdown_char(content):
    """Escape characters in body text that have special meaning in Markdown."""
    special_chars = ["*", "`", "~", "$"]
    for char in special_chars:
        content = content.replace(char, "\\" + char)
    return content
```
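Note: for example (illustrative input string):

```python
text = 'price is $5 *per page* and `raw` output'
print(ocr_escape_special_markdown_char(text))
# -> price is \$5 \*per page\* and \`raw\` output
```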
magic_pdf/libs/pdf_check.py (deleted, 100644 → 0)

```python
import fitz
import numpy as np
from loguru import logger
import re
from io import BytesIO
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams


def calculate_sample_count(total_page: int):
    """Compute how many pages to sample based on the total page count."""
    select_page_cnt = min(10, total_page)
    return select_page_cnt


def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
    pdf_docs = fitz.open("pdf", src_pdf_bytes)
    total_page = len(pdf_docs)
    if total_page == 0:
        # if the PDF has no pages, return an empty document directly
        logger.warning("PDF is empty, return empty document")
        return fitz.Document()
    select_page_cnt = calculate_sample_count(total_page)
    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
    sample_docs = fitz.Document()
    try:
        for index in page_num:
            sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
    except Exception as e:
        logger.exception(e)
    return sample_docs


def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Detect whether the PDF contains invalid (garbled) characters."""
    '''pdfminer is slow, so sample roughly 10 random pages first'''
    sample_docs = extract_pages(src_pdf_bytes)
    sample_pdf_bytes = sample_docs.tobytes()
    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
    laparams = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=None,
        detect_vertical=False,
        all_texts=False,
    )
    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
    text = text.replace("\n", "")
    # logger.info(text)
    '''garbled text extracted by pdfminer shows up as (cid:xxx)'''
    cid_pattern = re.compile(r'\(cid:\d+\)')
    matches = cid_pattern.findall(text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_radio = 0
    else:
        cid_chars_radio = cid_count / (cid_count + text_len - cid_len)
    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
    '''if more than 5% of the text is garbled, treat the document as garbled'''
    if cid_chars_radio > 0.05:
        return False  # garbled document
    else:
        return True  # normal document


def count_replacement_characters(text: str) -> int:
    """Count the number of 0xfffd characters in the string."""
    return text.count('\ufffd')


def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
    sample_docs = extract_pages(src_pdf_bytes)
    doc_text = ""
    for page in sample_docs:
        page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
        doc_text += page_text
    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    if text_len == 0:
        uffd_chars_radio = 0
    else:
        uffd_chars_radio = uffd_count / text_len
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
    '''if more than 1% of the text is garbled, treat the document as garbled'''
    if uffd_chars_radio > 0.01:
        return False  # garbled document
    else:
        return True  # normal document
```
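Note: a usage sketch (the file path is hypothetical). Both detectors take raw PDF bytes, sample up to 10 random pages, and return True for a normal document and False for a garbled one:

```python
with open('/tmp/sample.pdf', 'rb') as f:
    pdf_bytes = f.read()

ok_by_pdfminer = detect_invalid_chars(pdf_bytes)            # (cid:xxx) ratio > 5% -> False
ok_by_pymupdf = detect_invalid_chars_by_pymupdf(pdf_bytes)  # U+FFFD ratio > 1% -> False
print(ok_by_pdfminer, ok_by_pymupdf)
```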
magic_pdf/libs/pdf_image_tools.py (deleted, 100644 → 0)

```python
from io import BytesIO
import cv2
import fitz
import numpy as np
from PIL import Image
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256


def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: DataWriter):
    """Crop a JPG image from `page` (page number `page_num`) according to `bbox` and return the image path.
    save_path must support both s3 and local storage; the image is stored under save_path with the filename
    {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg, with the bbox values truncated to integers."""
    # build the filename
    filename = f'{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}'

    # the old version returned a path without the bucket
    img_path = join_path(return_path, filename) if return_path is not None else None

    # the new version generates a flat, hashed path
    img_hash256_path = f'{compute_sha256(img_path)}.jpg'

    # convert the coordinates to a fitz.Rect object
    rect = fitz.Rect(*bbox)
    # set the zoom factor to 3x
    zoom = fitz.Matrix(3, 3)
    # crop the image
    pix = page.get_pixmap(clip=rect, matrix=zoom)

    byte_data = pix.tobytes(output='jpeg', jpg_quality=95)

    imageWriter.write(img_hash256_path, byte_data)

    return img_hash256_path


def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
    # convert the coordinates to a fitz.Rect object
    rect = fitz.Rect(*bbox)
    # set the zoom factor to 3x
    zoom = fitz.Matrix(3, 3)
    # crop the image
    pix = page.get_pixmap(clip=rect, matrix=zoom)

    if mode == "cv2":
        # convert directly to a numpy array for cv2
        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
        # PyMuPDF uses RGB order while cv2 uses BGR order
        if pix.n == 3 or pix.n == 4:
            image_result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        else:
            image_result = img_array
    elif mode == "pillow":
        # wrap the byte data in a file-like object
        image_file = BytesIO(pix.tobytes(output='png'))
        # open the image with Pillow
        image_result = Image.open(image_file)
    else:
        raise ValueError(f"mode: {mode} is not supported.")

    return image_result
```
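Note: a sketch of cropping a page region without going through a DataWriter, via `cut_image_to_pil_image` (the PDF path and bbox are hypothetical):

```python
import fitz

doc = fitz.open('/tmp/sample.pdf')
page = doc[0]
bbox = (72, 72, 300, 200)  # PDF points: x0, y0, x1, y1; rendered at 3x zoom as above

pil_img = cut_image_to_pil_image(bbox, page, mode='pillow')
pil_img.save('/tmp/crop.png')

cv_img = cut_image_to_pil_image(bbox, page, mode='cv2')  # numpy array in BGR order
print(cv_img.shape)
```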
magic_pdf/libs/performance_stats.py (deleted, 100644 → 0)

```python
import time
import functools
from collections import defaultdict
from typing import Dict, List


class PerformanceStats:
    """Performance-statistics helper that collects and reports method execution times."""
    _stats: Dict[str, List[float]] = defaultdict(list)

    @classmethod
    def add_execution_time(cls, func_name: str, execution_time: float):
        """Record one execution time."""
        cls._stats[func_name].append(execution_time)

    @classmethod
    def get_stats(cls) -> Dict[str, dict]:
        """Return the aggregated statistics."""
        results = {}
        for func_name, times in cls._stats.items():
            results[func_name] = {
                'count': len(times),
                'total_time': sum(times),
                'avg_time': sum(times) / len(times),
                'min_time': min(times),
                'max_time': max(times)
            }
        return results

    @classmethod
    def print_stats(cls):
        """Print the statistics."""
        stats = cls.get_stats()
        print("\nPerformance statistics:")
        print("-" * 80)
        print(f"{'method':<40}{'calls':>8}{'total(s)':>12}{'avg(s)':>12}")
        print("-" * 80)
        for func_name, data in stats.items():
            print(f"{func_name:<40}{data['count']:8d}{data['total_time']:12.6f}{data['avg_time']:12.6f}")


def measure_time(func):
    """Decorator that measures a function's execution time."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        execution_time = time.time() - start_time

        # build a more detailed function identifier
        if hasattr(func, "__self__"):
            # instance method
            class_name = func.__self__.__class__.__name__
            full_name = f"{class_name}.{func.__name__}"
        elif hasattr(func, "__qualname__"):
            # class method or static method
            full_name = func.__qualname__
        else:
            module_name = func.__module__
            full_name = f"{module_name}.{func.__name__}"

        PerformanceStats.add_execution_time(full_name, execution_time)
        return result

    return wrapper
```
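Note: a usage sketch of the decorator; the decorated function below is made up for illustration:

```python
@measure_time
def fake_ocr_step(n: int) -> int:
    # stand-in for a real pipeline step
    return sum(i * i for i in range(n))

for _ in range(3):
    fake_ocr_step(100_000)

PerformanceStats.print_stats()  # prints call count, total and average time per function
```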
magic_pdf/libs/safe_filename.py (deleted, 100644 → 0)

```python
import os


def sanitize_filename(filename, replacement="_"):
    if os.name == 'nt':
        invalid_chars = '<>:"|?*'
        for char in invalid_chars:
            filename = filename.replace(char, replacement)
    return filename
```
magic_pdf/libs/version.py (deleted, 100644 → 0)

```python
__version__ = "1.3.12"
```
magic_pdf/model/__init__.py (deleted, 100644 → 0)

```python
__use_inside_model__ = True
__model_mode__ = 'full'
```
magic_pdf/model/doc_analyze_by_custom_model.py (deleted, 100644 → 0)

```python
import os
import time

import numpy as np
import torch

os.environ['FLAGS_npu_jit_compile'] = '0'  # disable paddle's jit compilation
os.environ['FLAGS_use_stride_kernel'] = '0'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # allow mps to fall back to cpu
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable the albumentations update check

from loguru import logger

from magic_pdf.model.sub_modules.model_utils import get_vram
from magic_pdf.config.enums import SupportedPdfParseMethod
import magic_pdf.model as model_config
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import (get_device, get_formula_config,
                                          get_layout_config,
                                          get_local_models_dir,
                                          get_table_recog_config)
from magic_pdf.model.model_list import MODEL


class ModelSingleton:
    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(
        self,
        ocr: bool,
        show_log: bool,
        lang=None,
        layout_model=None,
        formula_enable=None,
        table_enable=None,
    ):
        key = (ocr, show_log, lang, layout_model, formula_enable, table_enable)
        if key not in self._models:
            self._models[key] = custom_model_init(
                ocr=ocr,
                show_log=show_log,
                lang=lang,
                layout_model=layout_model,
                formula_enable=formula_enable,
                table_enable=table_enable,
            )
        return self._models[key]


def custom_model_init(
    ocr: bool = False,
    show_log: bool = False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    model = None

    if model_config.__model_mode__ == 'lite':
        logger.warning('The Lite mode is provided for developers to conduct testing only, and the output quality is '
                       'not guaranteed to be reliable.')
        model = MODEL.Paddle
    elif model_config.__model_mode__ == 'full':
        model = MODEL.PEK

    if model_config.__use_inside_model__:
        model_init_start = time.time()
        if model == MODEL.Paddle:
            from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
            custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log, lang=lang)
        elif model == MODEL.PEK:
            from magic_pdf.model.pdf_extract_kit import CustomPEKModel
            # read model-dir and device from the config file
            local_models_dir = get_local_models_dir()
            device = get_device()

            layout_config = get_layout_config()
            if layout_model is not None:
                layout_config['model'] = layout_model

            formula_config = get_formula_config()
            if formula_enable is not None:
                formula_config['enable'] = formula_enable

            table_config = get_table_recog_config()
            if table_enable is not None:
                table_config['enable'] = table_enable

            model_input = {
                'ocr': ocr,
                'show_log': show_log,
                'models_dir': local_models_dir,
                'device': device,
                'table_config': table_config,
                'layout_config': layout_config,
                'formula_config': formula_config,
                'lang': lang,
            }

            custom_model = CustomPEKModel(**model_input)
        else:
            logger.error('Not allow model_name!')
            exit(1)
        model_init_cost = time.time() - model_init_start
        logger.info(f'model init cost: {model_init_cost}')
    else:
        logger.error('use_inside_model is False, not allow to use inside model')
        exit(1)

    return custom_model


def doc_analyze(
    dataset: Dataset,
    ocr: bool = False,
    show_log: bool = False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
        else len(dataset) - 1
    )

    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
    images = []
    page_wh_list = []
    for index in range(len(dataset)):
        if start_page_id <= index <= end_page_id:
            page_data = dataset.get_page(index)
            img_dict = page_data.get_image()
            images.append(img_dict['img'])
            page_wh_list.append((img_dict['width'], img_dict['height']))
    images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]

    if len(images) >= MIN_BATCH_INFERENCE_SIZE:
        batch_size = MIN_BATCH_INFERENCE_SIZE
        batch_images = [images_with_extra_info[i:i + batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
    else:
        batch_images = [images_with_extra_info]

    results = []
    processed_images_count = 0
    for index, batch_image in enumerate(batch_images):
        processed_images_count += len(batch_image)
        logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
        result = may_batch_image_analyze(batch_image, ocr, show_log, layout_model, formula_enable, table_enable)
        results.extend(result)

    model_json = []
    for index in range(len(dataset)):
        if start_page_id <= index <= end_page_id:
            result = results.pop(0)
            page_width, page_height = page_wh_list.pop(0)
        else:
            result = []
            page_height = 0
            page_width = 0

        page_info = {'page_no': index, 'width': page_width, 'height': page_height}
        page_dict = {'layout_dets': result, 'page_info': page_info}
        model_json.append(page_dict)

    from magic_pdf.operators.models import InferenceResult
    return InferenceResult(model_json, dataset)


def batch_doc_analyze(
    datasets: list[Dataset],
    parse_method: str = 'auto',
    show_log: bool = False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
    batch_size = MIN_BATCH_INFERENCE_SIZE
    page_wh_list = []

    images_with_extra_info = []
    for dataset in datasets:

        ocr = False
        if parse_method == 'auto':
            if dataset.classify() == SupportedPdfParseMethod.TXT:
                ocr = False
            elif dataset.classify() == SupportedPdfParseMethod.OCR:
                ocr = True
        elif parse_method == 'ocr':
            ocr = True
        elif parse_method == 'txt':
            ocr = False

        _lang = dataset._lang

        for index in range(len(dataset)):
            page_data = dataset.get_page(index)
            img_dict = page_data.get_image()
            page_wh_list.append((img_dict['width'], img_dict['height']))
            images_with_extra_info.append((img_dict['img'], ocr, _lang))

    batch_images = [images_with_extra_info[i:i + batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
    results = []
    processed_images_count = 0
    for index, batch_image in enumerate(batch_images):
        processed_images_count += len(batch_image)
        logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
        result = may_batch_image_analyze(batch_image, True, show_log, layout_model, formula_enable, table_enable)
        results.extend(result)

    infer_results = []
    from magic_pdf.operators.models import InferenceResult
    for index in range(len(datasets)):
        dataset = datasets[index]
        model_json = []
        for i in range(len(dataset)):
            result = results.pop(0)
            page_width, page_height = page_wh_list.pop(0)
            page_info = {'page_no': i, 'width': page_width, 'height': page_height}
            page_dict = {'layout_dets': result, 'page_info': page_info}
            model_json.append(page_dict)
        infer_results.append(InferenceResult(model_json, dataset))
    return infer_results


def may_batch_image_analyze(
        images_with_extra_info: list[(np.ndarray, bool, str)],
        ocr: bool,
        show_log: bool = False,
        layout_model=None,
        formula_enable=None,
        table_enable=None):
    # os.environ['CUDA_VISIBLE_DEVICES'] = str(idx)

    from magic_pdf.model.batch_analyze import BatchAnalyze

    model_manager = ModelSingleton()

    # images = [image for image, _, _ in images_with_extra_info]
    batch_ratio = 1
    device = get_device()

    if str(device).startswith('npu'):
        import torch_npu
        if torch_npu.npu.is_available():
            torch.npu.set_compile_mode(jit_compile=False)

    if str(device).startswith('npu') or str(device).startswith('cuda'):
        vram = get_vram(device)
        if vram is not None:
            gpu_memory = int(os.getenv('VIRTUAL_VRAM_SIZE', round(vram)))
            if gpu_memory >= 16:
                batch_ratio = 16
            elif gpu_memory >= 12:
                batch_ratio = 8
            elif gpu_memory >= 8:
                batch_ratio = 4
            elif gpu_memory >= 6:
                batch_ratio = 2
            else:
                batch_ratio = 1
            logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
        else:
            # Default batch_ratio when VRAM can't be determined
            batch_ratio = 1
            logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}')

    # doc_analyze_start = time.time()

    batch_model = BatchAnalyze(model_manager, batch_ratio, show_log, layout_model, formula_enable, table_enable)
    results = batch_model(images_with_extra_info)

    # gc_start = time.time()
    clean_memory(get_device())
    # gc_time = round(time.time() - gc_start, 2)
    # logger.debug(f'gc time: {gc_time}')

    # doc_analyze_time = round(time.time() - doc_analyze_start, 2)
    # doc_analyze_speed = round(len(images) / doc_analyze_time, 2)
    # logger.debug(
    #     f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
    #     f' speed: {doc_analyze_speed} pages/second'
    # )
    return results
```
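Note: a hedged end-to-end sketch (not part of the diff). It assumes `PymuDocDataset` from `magic_pdf.data.dataset` as the Dataset implementation being passed in, and the input path is hypothetical:

```python
from magic_pdf.data.dataset import PymuDocDataset

with open('/tmp/sample.pdf', 'rb') as f:
    pdf_bytes = f.read()

ds = PymuDocDataset(pdf_bytes)
infer_result = doc_analyze(ds, ocr=True)  # returns an InferenceResult over per-page layout_dets
```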
magic_pdf/model/pdf_extract_kit.py (deleted, 100644 → 0)

```python
# flake8: noqa
import os
import time

import cv2
import torch
import yaml
from loguru import logger

os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable the albumentations update check

from magic_pdf.config.constants import *
from magic_pdf.model.model_list import AtomicModel
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.model.sub_modules.model_utils import (
    clean_vram, crop_img, get_res_list_from_layout_res)
from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
    get_adjusted_mfdetrec_res, get_ocr_result_list)


class CustomPEKModel:

    def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
        """
        ======== model init ========
        """
        # absolute path of the current file (pdf_extract_kit.py)
        current_file_path = os.path.abspath(__file__)
        # directory containing the current file (model)
        current_dir = os.path.dirname(current_file_path)
        # parent directory (magic_pdf)
        root_dir = os.path.dirname(current_dir)
        # model_config directory
        model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
        # full path of the model_configs.yaml file
        config_path = os.path.join(model_config_dir, 'model_configs.yaml')
        with open(config_path, 'r', encoding='utf-8') as f:
            self.configs = yaml.load(f, Loader=yaml.FullLoader)

        # initialize the parsing configuration
        # layout config
        self.layout_config = kwargs.get('layout_config')
        self.layout_model_name = self.layout_config.get('model', MODEL_NAME.DocLayout_YOLO)

        # formula config
        self.formula_config = kwargs.get('formula_config')
        self.mfd_model_name = self.formula_config.get('mfd_model', MODEL_NAME.YOLO_V8_MFD)
        self.mfr_model_name = self.formula_config.get('mfr_model', MODEL_NAME.UniMerNet_v2_Small)
        self.apply_formula = self.formula_config.get('enable', True)

        # table config
        self.table_config = kwargs.get('table_config')
        self.apply_table = self.table_config.get('enable', False)
        self.table_max_time = self.table_config.get('max_time', TABLE_MAX_TIME_VALUE)
        self.table_model_name = self.table_config.get('model', MODEL_NAME.RAPID_TABLE)
        self.table_sub_model_name = self.table_config.get('sub_model', None)

        # ocr config
        self.apply_ocr = ocr
        self.lang = kwargs.get('lang', None)

        logger.info(
            'DocAnalysis init, this may take some times, layout_model: {}, apply_formula: {}, apply_ocr: {}, '
            'apply_table: {}, table_model: {}, lang: {}'.format(
                self.layout_model_name,
                self.apply_formula,
                self.apply_ocr,
                self.apply_table,
                self.table_model_name,
                self.lang,
            )
        )
        # initialize the parsing scheme
        self.device = kwargs.get('device', 'cpu')

        logger.info('using device: {}'.format(self.device))
        models_dir = kwargs.get(
            'models_dir', os.path.join(root_dir, 'resources', 'models')
        )
        logger.info('using models_dir: {}'.format(models_dir))

        atom_model_manager = AtomModelSingleton()

        # initialize formula recognition
        if self.apply_formula:
            # initialize the formula detection model
            self.mfd_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFD,
                mfd_weights=str(
                    os.path.join(models_dir, self.configs['weights'][self.mfd_model_name])
                ),
                device=self.device,
            )

            # initialize the formula recognition model
            mfr_weight_dir = str(
                os.path.join(models_dir, self.configs['weights'][self.mfr_model_name])
            )
            mfr_cfg_path = str(os.path.join(model_config_dir, 'UniMERNet', 'demo.yaml'))

            self.mfr_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFR,
                mfr_weight_dir=mfr_weight_dir,
                mfr_cfg_path=mfr_cfg_path,
                device=self.device,
            )

        # initialize the layout model
        if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
            self.layout_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.Layout,
                layout_model_name=MODEL_NAME.LAYOUTLMv3,
                layout_weights=str(
                    os.path.join(models_dir, self.configs['weights'][self.layout_model_name])
                ),
                layout_config_file=str(
                    os.path.join(model_config_dir, 'layoutlmv3', 'layoutlmv3_base_inference.yaml')
                ),
                device='cpu' if str(self.device).startswith("mps") else self.device,
            )
        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
            self.layout_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.Layout,
                layout_model_name=MODEL_NAME.DocLayout_YOLO,
                doclayout_yolo_weights=str(
                    os.path.join(models_dir, self.configs['weights'][self.layout_model_name])
                ),
                device=self.device,
            )
        # initialize ocr
        self.ocr_model = atom_model_manager.get_atom_model(
            atom_model_name=AtomicModel.OCR,
            ocr_show_log=show_log,
            det_db_box_thresh=0.3,
            lang=self.lang
        )
        # init table model
        if self.apply_table:
            table_model_dir = self.configs['weights'][self.table_model_name]
            self.table_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.Table,
                table_model_name=self.table_model_name,
                table_model_path=str(os.path.join(models_dir, table_model_dir)),
                table_max_time=self.table_max_time,
                device=self.device,
                ocr_engine=self.ocr_model,
                table_sub_model_name=self.table_sub_model_name
            )

        logger.info('DocAnalysis init done!')

    def __call__(self, image):
        # layout detection
        layout_start = time.time()
        layout_res = []
        if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
            # layoutlmv3
            layout_res = self.layout_model(image, ignore_catids=[])
        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
            layout_res = self.layout_model.predict(image)

        layout_cost = round(time.time() - layout_start, 2)
        logger.info(f'layout detection time: {layout_cost}')

        if self.apply_formula:
            # formula detection
            mfd_start = time.time()
            mfd_res = self.mfd_model.predict(image)
            logger.info(f'mfd time: {round(time.time() - mfd_start, 2)}')

            # formula recognition
            mfr_start = time.time()
            formula_list = self.mfr_model.predict(mfd_res, image)
            layout_res.extend(formula_list)
            mfr_cost = round(time.time() - mfr_start, 2)
            logger.info(f'formula nums: {len(formula_list)}, mfr time: {mfr_cost}')

        # free GPU memory
        clean_vram(self.device, vram_threshold=6)

        # collect the ocr, table and formula regions from layout_res
        ocr_res_list, table_res_list, single_page_mfdetrec_res = (
            get_res_list_from_layout_res(layout_res)
        )

        # ocr recognition
        ocr_start = time.time()
        # Process each area that requires OCR processing
        for res in ocr_res_list:
            new_image, useful_list = crop_img(res, image, crop_paste_x=50, crop_paste_y=50)
            adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list)

            # OCR recognition
            new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)

            if self.apply_ocr:
                ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
            else:
                ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res, rec=False)[0]

            # Integration results
            if ocr_res:
                ocr_result_list = get_ocr_result_list(ocr_res, useful_list)
                layout_res.extend(ocr_result_list)

        ocr_cost = round(time.time() - ocr_start, 2)
        if self.apply_ocr:
            logger.info(f"ocr time: {ocr_cost}")
        else:
            logger.info(f"det time: {ocr_cost}")

        # table recognition
        if self.apply_table:
            table_start = time.time()
            for res in table_res_list:
                new_image, _ = crop_img(res, image)
                single_table_start_time = time.time()
                html_code = None
                if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                    with torch.no_grad():
                        table_result = self.table_model.predict(new_image, 'html')
                        if len(table_result) > 0:
                            html_code = table_result[0]
                elif self.table_model_name == MODEL_NAME.TABLE_MASTER:
                    html_code = self.table_model.img2html(new_image)
                elif self.table_model_name == MODEL_NAME.RAPID_TABLE:
                    html_code, table_cell_bboxes, logic_points, elapse = self.table_model.predict(new_image)
                run_time = time.time() - single_table_start_time
                if run_time > self.table_max_time:
                    logger.warning(f'table recognition processing exceeds max time {self.table_max_time}s')
                # check whether the result looks valid
                if html_code:
                    expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
                    if expected_ending:
                        res['html'] = html_code
                    else:
                        logger.warning('table recognition processing fails, not found expected HTML table end')
                else:
                    logger.warning('table recognition processing fails, not get html return')
            logger.info(f'table time: {round(time.time() - table_start, 2)}')

        return layout_res
```
magic_pdf/model/pp_structure_v2.py (deleted, 100644 → 0)

```python
import random

from loguru import logger

try:
    from paddleocr import PPStructure
except ImportError:
    logger.error('paddleocr not installed, please install by "pip install magic-pdf[lite]"')
    exit(1)


def region_to_bbox(region):
    x0 = region[0][0]
    y0 = region[0][1]
    x1 = region[2][0]
    y1 = region[2][1]
    return [x0, y0, x1, y1]


class CustomPaddleModel:
    def __init__(self,
                 ocr: bool = False,
                 show_log: bool = False,
                 lang=None,
                 det_db_box_thresh=0.3,
                 use_dilation=True,
                 det_db_unclip_ratio=1.8):
        if lang is not None:
            self.model = PPStructure(
                table=False,
                ocr=True,
                show_log=show_log,
                lang=lang,
                det_db_box_thresh=det_db_box_thresh,
                use_dilation=use_dilation,
                det_db_unclip_ratio=det_db_unclip_ratio,
            )
        else:
            self.model = PPStructure(
                table=False,
                ocr=True,
                show_log=show_log,
                det_db_box_thresh=det_db_box_thresh,
                use_dilation=use_dilation,
                det_db_unclip_ratio=det_db_unclip_ratio,
            )

    def __call__(self, img):
        try:
            import cv2
        except ImportError:
            logger.error("opencv-python not installed, please install by pip.")
            exit(1)
        # convert the RGB image to BGR for paddle
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        result = self.model(img)
        spans = []

        for line in result:
            line.pop("img")
            """
            Map paddle output to type numbers:
            title: 0            # title
            text: 1             # text
            header: 2           # abandon
            footer: 2            # abandon
            reference: 1        # text or abandon
            equation: 8         # interline equation (block)
            equation: 14        # interline equation (text)
            figure: 3           # image
            figure_caption: 4   # image caption
            table: 5            # table
            table_caption: 6    # table caption
            """
            if line["type"] == "title":
                line["category_id"] = 0
            elif line["type"] in ["text", "reference"]:
                line["category_id"] = 1
            elif line["type"] == "figure":
                line["category_id"] = 3
            elif line["type"] == "figure_caption":
                line["category_id"] = 4
            elif line["type"] == "table":
                line["category_id"] = 5
            elif line["type"] == "table_caption":
                line["category_id"] = 6
            elif line["type"] == "equation":
                line["category_id"] = 8
            elif line["type"] in ["header", "footer"]:
                line["category_id"] = 2
            else:
                logger.warning(f"unknown type: {line['type']}")

            # compatibility with paddleocr versions that do not output a score
            if line.get("score") is None:
                line["score"] = 0.5 + random.random() * 0.5

            res = line.pop("res", None)
            if res is not None and len(res) > 0:
                for span in res:
                    new_span = {
                        "category_id": 15,
                        "bbox": region_to_bbox(span["text_region"]),
                        "score": span["confidence"],
                        "text": span["text"],
                    }
                    spans.append(new_span)

        if len(spans) > 0:
            result.extend(spans)

        return result
```