wangsen / MinerU / Commits / 0c7a0882

Unverified commit 0c7a0882, authored Jun 12, 2025 by Xiaomeng Zhao; committed by GitHub on Jun 12, 2025.

Merge pull request #2611 from myhloli/dev

Dev

Parents: 3bd0ecf1, a392f445
Changes: 262

Showing 20 changed files with 29 additions and 2353 deletions (+29, -2353)
magic_pdf/operators/pipes.py                              +0   -191
magic_pdf/pdf_parse_union_core_v2.py                      +0   -1049
magic_pdf/pre_proc/__init__.py                            +0   -0
magic_pdf/pre_proc/construct_page_dict.py                 +0   -17
magic_pdf/pre_proc/cut_image.py                           +0   -32
magic_pdf/pre_proc/ocr_span_list_modify.py                +0   -131
magic_pdf/pre_proc/remove_bbox_overlap.py                 +0   -100
magic_pdf/resources/model_config/model_configs.yaml       +0   -8
magic_pdf/resources/slanet_plus/slanet-plus.onnx          +0   -0
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt     +0   -0
magic_pdf/spark/__init__.py                               +0   -0
magic_pdf/spark/spark_api.py                              +0   -49
magic_pdf/tools/__init__.py                               +0   -0
magic_pdf/tools/cli.py                                    +0   -161
magic_pdf/tools/cli_dev.py                                +0   -149
magic_pdf/tools/common.py                                 +0   -340
magic_pdf/utils/__init__.py                               +0   -0
magic_pdf/utils/annotations.py                            +0   -11
magic_pdf/utils/office_to_pdf.py                          +0   -115
mineru.template.json                                      +29  -0
magic_pdf/operators/pipes.py (deleted, 100644 → 0)

```python
import copy
import json
import os
from typing import Callable

from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.dict2md.ocr_mkcontent import union_make
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
                                      draw_span_bbox)
from magic_pdf.libs.json_compressor import JsonCompressor


class PipeResult:
    def __init__(self, pipe_res, dataset: Dataset):
        """Initialize.

        Args:
            pipe_res (list[dict]): the pipeline-processed model inference result
            dataset (Dataset): the dataset associated with pipe_res
        """
        self._pipe_res = pipe_res
        self._dataset = dataset

    def get_markdown(
        self,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ) -> str:
        """Get markdown content.

        Args:
            img_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.
            md_make_mode (str, optional): the type of Markdown content to make. Defaults to MakeMode.MM_MD.

        Returns:
            str: markdown content
        """
        pdf_info_list = self._pipe_res['pdf_info']
        md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_dir_or_bucket_prefix)
        return md_content

    def dump_md(
        self,
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Dump the Markdown.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the markdown
            img_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.
            md_make_mode (str, optional): the type of Markdown content to make. Defaults to MakeMode.MM_MD.
        """
        md_content = self.get_markdown(
            img_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
        )
        writer.write_string(file_path, md_content)

    def get_content_list(
        self,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ) -> str:
        """Get the content list.

        Args:
            image_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.

        Returns:
            str: content list content
        """
        pdf_info_list = self._pipe_res['pdf_info']
        content_list = union_make(
            pdf_info_list,
            MakeMode.STANDARD_FORMAT,
            drop_mode,
            image_dir_or_bucket_prefix,
        )
        return content_list

    def dump_content_list(
        self,
        writer: DataWriter,
        file_path: str,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
    ):
        """Dump the content list.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the content list
            image_dir_or_bucket_prefix (str): the S3 bucket prefix or local directory used to store the figures
            drop_mode (str, optional): drop strategy for corrupted or inappropriate pages. Defaults to DropMode.NONE.
        """
        content_list = self.get_content_list(
            image_dir_or_bucket_prefix, drop_mode=drop_mode,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
        )

    def get_middle_json(self) -> str:
        """Get middle json.

        Returns:
            str: the content of the middle json
        """
        return json.dumps(self._pipe_res, ensure_ascii=False, indent=4)

    def dump_middle_json(self, writer: DataWriter, file_path: str):
        """Dump the result of the pipeline.

        Args:
            writer (DataWriter): file writer handle
            file_path (str): the file location of the middle json
        """
        middle_json = self.get_middle_json()
        writer.write_string(file_path, middle_json)

    def draw_layout(self, file_path: str) -> None:
        """Draw the layout.

        Args:
            file_path (str): the file location of the layout result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_layout_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_span(self, file_path: str):
        """Draw the spans.

        Args:
            file_path (str): the file location of the span result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_span_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def draw_line_sort(self, file_path: str):
        """Draw the line sort order.

        Args:
            file_path (str): the file location of the line sort result file
        """
        dir_name = os.path.dirname(file_path)
        base_name = os.path.basename(file_path)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name, exist_ok=True)
        pdf_info = self._pipe_res['pdf_info']
        draw_line_sort_bbox(pdf_info, self._dataset.data_bits(), dir_name, base_name)

    def get_compress_pdf_mid_data(self):
        """Compress the pipeline result.

        Returns:
            str: the compressed pipeline result
        """
        return JsonCompressor.compress_json(self._pipe_res)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the pipeline result.

        Args:
            proc (Callable): invoked as proc(pipeline_result, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        return proc(copy.deepcopy(self._pipe_res), *args, **kwargs)
```
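
For orientation, here is a minimal sketch of how this now-deleted PipeResult API was typically driven. The output directory and file names are illustrative assumptions, and `pipe_result` stands for an object produced upstream by the pipeline; only `FileBasedDataWriter` is taken from the package itself.

```python
# Hypothetical driver for the deleted PipeResult API (names are illustrative).
from magic_pdf.data.data_reader_writer import FileBasedDataWriter

def dump_all(pipe_result, out_dir='output'):
    writer = FileBasedDataWriter(out_dir)
    # Markdown, with figure links resolved against a local "images" directory
    pipe_result.dump_md(writer, 'example.md', 'images')
    # Structured content list plus the raw middle json, useful for debugging
    pipe_result.dump_content_list(writer, 'example_content_list.json', 'images')
    pipe_result.dump_middle_json(writer, 'example_middle.json')
```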
magic_pdf/pdf_parse_union_core_v2.py (deleted, 100644 → 0)

```python
import copy
import math
import os
import re
import statistics
import time
import warnings
from typing import List

import cv2
import fitz
import torch
import numpy as np
from loguru import logger
from tqdm import tqdm

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, __is_overlaps_y_exceeds_threshold
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_llm_aided_config, get_device
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.post_proc.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
    remove_overlaps_min_spans, remove_x_overlapping_chars

os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # prevent albumentations from checking for updates


def __replace_STX_ETX(text_str: str):
    """Replace \u0002 and \u0003, as these characters become garbled when
    extracted using pymupdf. In fact, they were originally quotation marks.

    Drawback: This issue has only been observed in English text so far, not in Chinese text.

    Args:
        text_str (str): raw text

    Returns:
        _type_: replaced text
    """  # noqa: E501
    if text_str:
        s = text_str.replace('\u0002', "'")
        s = s.replace('\u0003', "'")
        return s
    return text_str


# Split ligature characters
def __replace_ligatures(text: str):
    ligatures = {
        'ﬁ': 'fi', 'ﬂ': 'fl', 'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬅ': 'ft', 'ﬆ': 'st'
    }
    return re.sub('|'.join(map(re.escape, ligatures.keys())), lambda m: ligatures[m.group()], text)


def chars_to_content(span):
    # Check whether the chars in the span are empty
    if len(span['chars']) == 0:
        pass
    else:
        # First sort the chars by the x coordinate of the center of char['bbox']
        span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)

        # Calculate the width of each character
        char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
        # Calculate the median width
        median_width = statistics.median(char_widths)

        # Remove some chars based on their x-axis overlap ratio
        span = remove_x_overlapping_chars(span, median_width)

        content = ''
        for char in span['chars']:
            # If the distance between the next char's x0 and the previous char's x1 exceeds
            # 0.25 of a character width, a space needs to be inserted between them
            char1 = char
            char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
            if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
                content += f"{char['c']} "
            else:
                content += char['c']
        span['content'] = __replace_ligatures(content)

    del span['chars']


LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';', ']', '】', '}', '}', '>', '》', '、', ',', ',', '-', '—', '–',)
LINE_START_FLAG = ('(', '(', '"', '“', '【', '{', '《', '<', '「', '『', '【', '[',)


def fill_char_in_spans(spans, all_chars):
    # Simply sort the spans from top to bottom
    spans = sorted(spans, key=lambda x: x['bbox'][1])

    for char in all_chars:
        for span in spans:
            if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
                span['chars'].append(char)
                break

    need_ocr_spans = []
    for span in spans:
        chars_to_content(span)
        # Some spans contain no text but hold one or two empty placeholders;
        # filter them by width/height and content length
        if len(span['content']) * span['height'] < span['width'] * 0.5:
            # logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
            need_ocr_spans.append(span)
        del span['height'], span['width']
    return need_ocr_spans


# Use the more robust center-point test
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
    char_center_x = (char_bbox[0] + char_bbox[2]) / 2
    char_center_y = (char_bbox[1] + char_bbox[3]) / 2
    span_center_y = (span_bbox[1] + span_bbox[3]) / 2
    span_height = span_bbox[3] - span_bbox[1]

    if (
        span_bbox[0] < char_center_x < span_bbox[2]
        and span_bbox[1] < char_center_y < span_bbox[3]
        and abs(char_center_y - span_center_y) < span_height * span_height_radio  # the vertical offset between the char's center line and the span's center line must stay within span_height_radio of the span height
    ):
        return True
    else:
        # If the char is a LINE_STOP_FLAG, skip the center-point test and use another scheme
        # (left boundary inside the span area, height test unchanged).
        # This mainly gives trailing punctuation a chance to enter the span; such a char
        # should also be close to the span's right boundary.
        if char in LINE_STOP_FLAG:
            if (
                (span_bbox[2] - span_height) < char_bbox[0] < span_bbox[2]
                and char_center_x > span_bbox[0]
                and span_bbox[1] < char_center_y < span_bbox[3]
                and abs(char_center_y - span_center_y) < span_height * span_height_radio
            ):
                return True
        elif char in LINE_START_FLAG:
            if (
                span_bbox[0] < char_bbox[2] < (span_bbox[0] + span_height)
                and char_center_x < span_bbox[2]
                and span_bbox[1] < char_center_y < span_bbox[3]
                and abs(char_center_y - span_center_y) < span_height * span_height_radio
            ):
                return True
        else:
            return False


def remove_tilted_line(text_blocks):
    for block in text_blocks:
        remove_lines = []
        for line in block['lines']:
            cosine, sine = line['dir']
            # Compute the angle in radians
            angle_radians = math.atan2(sine, cosine)
            # Convert radians to degrees
            angle_degrees = math.degrees(angle_radians)
            if 2 < abs(angle_degrees) < 88:
                remove_lines.append(line)
        for line in remove_lines:
            block['lines'].remove(line)


def calculate_contrast(img, img_mode) -> float:
    """Compute the contrast of the given image.

    :param img: the image, as a numpy.ndarray
    :param img_mode: the color channel order of the image, 'rgb' or 'bgr'
    :return: the contrast value of the image
    """
    if img_mode == 'rgb':
        # Convert an RGB image to grayscale
        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    elif img_mode == 'bgr':
        # Convert a BGR image to grayscale
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")

    # Compute the mean and standard deviation
    mean_value = np.mean(gray_img)
    std_dev = np.std(gray_img)
    # Contrast is defined as the standard deviation divided by the mean
    # (plus a small constant to avoid division by zero)
    contrast = std_dev / (mean_value + 1e-6)
    # logger.debug(f"contrast: {contrast}")
    return round(contrast, 2)


# @measure_time
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
    # cid rendered as 0xfffd, ligatures split apart
    # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']

    # cid rendered as 0xfffd, ligatures kept intact
    # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']

    # Custom flags produce many 0xfffd characters; pymupdf may be able to handle PDFs with
    # built-in dictionaries on its own, so custom flags are no longer used
    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
    # text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']

    # Remove all lines whose angle is neither 0 nor 90 degrees
    remove_tilted_line(text_blocks_raw)

    all_pymu_chars = []
    for block in text_blocks_raw:
        for line in block['lines']:
            cosine, sine = line['dir']
            if abs(cosine) < 0.9 or abs(sine) > 0.1:
                continue
            for span in line['spans']:
                all_pymu_chars.extend(span['chars'])

    # Compute the median height of all spans
    span_height_list = []
    for span in spans:
        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        span_height = span['bbox'][3] - span['bbox'][1]
        span['height'] = span_height
        span['width'] = span['bbox'][2] - span['bbox'][0]
        span_height_list.append(span_height)
    if len(span_height_list) == 0:
        return spans
    else:
        median_span_height = statistics.median(span_height_list)

    useful_spans = []
    unuseful_spans = []
    # Two features of a vertical span: 1. its height covers several lines 2. its height/width ratio exceeds some value
    vertical_spans = []
    for span in spans:
        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        for block in all_bboxes + all_discarded_blocks:
            if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
                continue
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
                if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
                    vertical_spans.append(span)
                elif block in all_bboxes:
                    useful_spans.append(span)
                else:
                    unuseful_spans.append(span)
                break

    """Vertical span boxes are filled directly from pymu lines"""
    if len(vertical_spans) > 0:
        text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
        all_pymu_lines = []
        for block in text_blocks:
            for line in block['lines']:
                all_pymu_lines.append(line)

        for pymu_line in all_pymu_lines:
            for span in vertical_spans:
                if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
                    for pymu_span in pymu_line['spans']:
                        span['content'] += pymu_span['text']
                    break

        for span in vertical_spans:
            if len(span['content']) == 0:
                spans.remove(span)

    """Horizontal span boxes with no chars are filled by OCR"""
    new_spans = []
    for span in useful_spans + unuseful_spans:
        if span['type'] in [ContentType.Text]:
            span['chars'] = []
            new_spans.append(span)

    need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)

    if len(need_ocr_spans) > 0:
        # Initialize the OCR model
        # atom_model_manager = AtomModelSingleton()
        # ocr_model = atom_model_manager.get_atom_model(
        #     atom_model_name='ocr',
        #     ocr_show_log=False,
        #     det_db_box_thresh=0.3,
        #     lang=lang
        # )

        for span in need_ocr_spans:
            # Crop the span's bbox, then OCR it
            span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')

            # Compute the span's contrast; spans at or below 0.17 are not OCRed
            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                spans.remove(span)
                continue
            # pass

            span['content'] = ''
            span['score'] = 1
            span['np_img'] = span_img

            # ocr_res = ocr_model.ocr(span_img, det=False)
            # if ocr_res and len(ocr_res) > 0:
            #     if len(ocr_res[0]) > 0:
            #         ocr_text, ocr_score = ocr_res[0][0]
            #         # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
            #         if ocr_score > 0.5 and len(ocr_text) > 0:
            #             span['content'] = ocr_text
            #             span['score'] = float(round(ocr_score, 2))
            #         else:
            #             spans.remove(span)

    return spans


def model_init(model_name: str):
    from transformers import LayoutLMv3ForTokenClassification
    device_name = get_device()
    bf_16_support = False
    if device_name.startswith("cuda"):
        bf_16_support = torch.cuda.is_bf16_supported()
    elif device_name.startswith("mps"):
        bf_16_support = True

    device = torch.device(device_name)
    if model_name == 'layoutreader':
        # Check whether the modelscope cache directory exists
        layoutreader_model_dir = get_local_layoutreader_model_dir()
        if os.path.exists(layoutreader_model_dir):
            model = LayoutLMv3ForTokenClassification.from_pretrained(layoutreader_model_dir)
        else:
            logger.warning('local layoutreader model not exists, use online model from huggingface')
            model = LayoutLMv3ForTokenClassification.from_pretrained('hantian/layoutreader')
        if bf_16_support:
            model.to(device).eval().bfloat16()
        else:
            model.to(device).eval()
    else:
        logger.error('model name not allow')
        exit(1)
    return model


class ModelSingleton:
    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(self, model_name: str):
        if model_name not in self._models:
            self._models[model_name] = model_init(model_name=model_name)
        return self._models[model_name]


def do_predict(boxes: List[List[int]], model) -> List[int]:
    from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
        boxes2inputs, parse_logits, prepare_inputs)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
        inputs = boxes2inputs(boxes)
        inputs = prepare_inputs(inputs, model)
        logits = model(**inputs).logits.cpu().squeeze(0)
    return parse_logits(logits, len(boxes))


def cal_block_index(fix_blocks, sorted_bboxes):
    if sorted_bboxes is not None:
        # Sort with layoutreader
        for block in fix_blocks:
            line_index_list = []
            if len(block['lines']) == 0:
                block['index'] = sorted_bboxes.index(block['bbox'])
            else:
                for line in block['lines']:
                    line['index'] = sorted_bboxes.index(line['bbox'])
                    line_index_list.append(line['index'])
                median_value = statistics.median(line_index_list)
                block['index'] = median_value

            # Delete the virtual line info in image/table body blocks and backfill it with real_lines
            if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
                if 'real_lines' in block:
                    block['virtual_lines'] = copy.deepcopy(block['lines'])
                    block['lines'] = copy.deepcopy(block['real_lines'])
                    del block['real_lines']
    else:
        # Sort with xycut
        block_bboxes = []
        for block in fix_blocks:
            # If any value of block['bbox'] is below 0, set it to 0
            block['bbox'] = [max(0, x) for x in block['bbox']]
            block_bboxes.append(block['bbox'])

            # Delete the virtual line info in image/table body blocks and backfill it with real_lines
            if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
                if 'real_lines' in block:
                    block['virtual_lines'] = copy.deepcopy(block['lines'])
                    block['lines'] = copy.deepcopy(block['real_lines'])
                    del block['real_lines']

        import numpy as np
        from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import \
            recursive_xy_cut

        random_boxes = np.array(block_bboxes)
        np.random.shuffle(random_boxes)
        res = []
        recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
        assert len(res) == len(block_bboxes)
        sorted_boxes = random_boxes[np.array(res)].tolist()

        for i, block in enumerate(fix_blocks):
            block['index'] = sorted_boxes.index(block['bbox'])

        # Generate line indexes
        sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
        line_inedx = 1
        for block in sorted_blocks:
            for line in block['lines']:
                line['index'] = line_inedx
                line_inedx += 1

    return fix_blocks


def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    # block_bbox is a tuple (x0, y0, x1, y1); (x0, y0) is one corner and (x1, y1) the opposite corner
    x0, y0, x1, y1 = block_bbox

    block_height = y1 - y0
    block_weight = x1 - x0

    # If the block height is below n lines of body text, return the block's bbox directly
    if line_height * 2 < block_height:
        if (
            block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
        ):  # possibly a two-column layout, can be cut finer
            lines = int(block_height / line_height)
        else:
            # If the block width exceeds 0.4 of the page width, split the block into 3 lines
            # (a complex layout; the image must not be cut too finely)
            if block_weight > page_w * 0.4:
                lines = 3
            elif block_weight > page_w * 0.25:  # (possibly a three-column layout, also cut finer)
                lines = int(block_height / line_height)
            else:  # check the aspect ratio
                if block_height / block_weight > 1.2:  # tall and narrow blocks are not split
                    return [[x0, y0, x1, y1]]
                else:  # otherwise split into two lines
                    lines = 2

        line_height = (y1 - y0) / lines

        # Determine the y position to start drawing lines from
        current_y = y0

        # Store the line positions [(x0, y), ...]
        lines_positions = []

        for i in range(lines):
            lines_positions.append([x0, current_y, x1, current_y + line_height])
            current_y += line_height
        return lines_positions

    else:
        return [[x0, y0, x1, y1]]


def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks):
    page_line_list = []

    def add_lines_to_block(b):
        line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h)
        b['lines'] = []
        for line_bbox in line_bboxes:
            b['lines'].append({'bbox': line_bbox, 'spans': []})
        page_line_list.extend(line_bboxes)

    for block in fix_blocks:
        if block['type'] in [
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote
        ]:
            if len(block['lines']) == 0:
                add_lines_to_block(block)
            elif block['type'] in [BlockType.Title] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2:
                block['real_lines'] = copy.deepcopy(block['lines'])
                add_lines_to_block(block)
            else:
                for line in block['lines']:
                    bbox = line['bbox']
                    page_line_list.append(bbox)
        elif block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
            block['real_lines'] = copy.deepcopy(block['lines'])
            add_lines_to_block(block)

    for block in footnote_blocks:
        footnote_block = {'bbox': block[:4]}
        add_lines_to_block(footnote_block)

    if len(page_line_list) > 200:  # layoutreader supports at most 512 lines
        return None

    # Sort with layoutreader
    x_scale = 1000.0 / page_w
    y_scale = 1000.0 / page_h
    boxes = []
    # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
    for left, top, right, bottom in page_line_list:
        if left < 0:
            logger.warning(
                f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            left = 0
        if right > page_w:
            logger.warning(
                f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            right = page_w
        if top < 0:
            logger.warning(
                f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            top = 0
        if bottom > page_h:
            logger.warning(
                f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}')  # noqa: E501
            bottom = page_h

        left = round(left * x_scale)
        top = round(top * y_scale)
        right = round(right * x_scale)
        bottom = round(bottom * y_scale)
        assert (
            1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
        ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}'  # noqa: E126, E121
        boxes.append([left, top, right, bottom])

    model_manager = ModelSingleton()
    model = model_manager.get_model('layoutreader')
    with torch.no_grad():
        orders = do_predict(boxes, model)
    sorted_bboxes = [page_line_list[i] for i in orders]

    return sorted_bboxes


def get_line_height(blocks):
    page_line_height_list = []
    for block in blocks:
        if block['type'] in [
            BlockType.Text, BlockType.Title,
            BlockType.ImageCaption, BlockType.ImageFootnote,
            BlockType.TableCaption, BlockType.TableFootnote
        ]:
            for line in block['lines']:
                bbox = line['bbox']
                page_line_height_list.append(int(bbox[3] - bbox[1]))
    if len(page_line_height_list) > 0:
        return statistics.median(page_line_height_list)
    else:
        return 10


def process_groups(groups, body_key, caption_key, footnote_key):
    body_blocks = []
    caption_blocks = []
    footnote_blocks = []
    for i, group in enumerate(groups):
        group[body_key]['group_id'] = i
        body_blocks.append(group[body_key])
        for caption_block in group[caption_key]:
            caption_block['group_id'] = i
            caption_blocks.append(caption_block)
        for footnote_block in group[footnote_key]:
            footnote_block['group_id'] = i
            footnote_blocks.append(footnote_block)
    return body_blocks, caption_blocks, footnote_blocks


def process_block_list(blocks, body_type, block_type):
    indices = [block['index'] for block in blocks]
    median_index = statistics.median(indices)

    body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), [])

    return {
        'type': block_type,
        'bbox': body_bbox,
        'blocks': blocks,
        'index': median_index,
    }


def revert_group_blocks(blocks):
    image_groups = {}
    table_groups = {}
    new_blocks = []
    for block in blocks:
        if block['type'] in [BlockType.ImageBody, BlockType.ImageCaption, BlockType.ImageFootnote]:
            group_id = block['group_id']
            if group_id not in image_groups:
                image_groups[group_id] = []
            image_groups[group_id].append(block)
        elif block['type'] in [BlockType.TableBody, BlockType.TableCaption, BlockType.TableFootnote]:
            group_id = block['group_id']
            if group_id not in table_groups:
                table_groups[group_id] = []
            table_groups[group_id].append(block)
        else:
            new_blocks.append(block)

    for group_id, blocks in image_groups.items():
        new_blocks.append(process_block_list(blocks, BlockType.ImageBody, BlockType.Image))

    for group_id, blocks in table_groups.items():
        new_blocks.append(process_block_list(blocks, BlockType.TableBody, BlockType.Table))

    return new_blocks


def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    def get_block_bboxes(blocks, block_type_list):
        return [block[0:4] for block in blocks if block[7] in block_type_list]

    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
    other_block_type = []
    for block_type in BlockType.__dict__.values():
        if not isinstance(block_type, str):
            continue
        if block_type not in [BlockType.ImageBody, BlockType.TableBody]:
            other_block_type.append(block_type)
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])

    new_spans = []

    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']

        if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
               discarded_block_bboxes):
            new_spans.append(span)
            continue

        if span_type == ContentType.Image:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   image_bboxes):
                new_spans.append(span)
        elif span_type == ContentType.Table:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   table_bboxes):
                new_spans.append(span)
        else:
            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
                   other_block_bboxes):
                new_spans.append(span)

    return new_spans


def parse_page_core(
    page_doc: PageableData, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
):
    need_drop = False
    drop_reason = []

    """Get the block info that will be used later from the magic_model object"""
    img_groups = magic_model.get_imgs_v2(page_id)
    table_groups = magic_model.get_tables_v2(page_id)

    """Group the image and table blocks"""
    img_body_blocks, img_caption_blocks, img_footnote_blocks = process_groups(
        img_groups, 'image_body', 'image_caption_list', 'image_footnote_list'
    )
    table_body_blocks, table_caption_blocks, table_footnote_blocks = process_groups(
        table_groups, 'table_body', 'table_caption_list', 'table_footnote_list'
    )

    discarded_blocks = magic_model.get_discarded(page_id)
    text_blocks = magic_model.get_text_blocks(page_id)
    title_blocks = magic_model.get_title_blocks(page_id)
    inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id)

    page_w, page_h = magic_model.get_page_size(page_id)

    def merge_title_blocks(blocks, x_distance_threshold=0.1 * page_w):
        def merge_two_bbox(b1, b2):
            x_min = min(b1['bbox'][0], b2['bbox'][0])
            y_min = min(b1['bbox'][1], b2['bbox'][1])
            x_max = max(b1['bbox'][2], b2['bbox'][2])
            y_max = max(b1['bbox'][3], b2['bbox'][3])
            return x_min, y_min, x_max, y_max

        def merge_two_blocks(b1, b2):
            # Merge the bounding boxes of the two title blocks
            b1['bbox'] = merge_two_bbox(b1, b2)
            # Merge the text content of the two title blocks
            line1 = b1['lines'][0]
            line2 = b2['lines'][0]
            line1['bbox'] = merge_two_bbox(line1, line2)
            line1['spans'].extend(line2['spans'])
            return b1, b2

        # Cluster the title blocks by y-axis overlap
        y_overlapping_blocks = []
        title_bs = [b for b in blocks if b['type'] == BlockType.Title]
        while title_bs:
            block1 = title_bs.pop(0)
            current_row = [block1]
            to_remove = []
            for block2 in title_bs:
                if (
                    __is_overlaps_y_exceeds_threshold(block1['bbox'], block2['bbox'], 0.9)
                    and len(block1['lines']) == 1
                    and len(block2['lines']) == 1
                ):
                    current_row.append(block2)
                    to_remove.append(block2)
            for b in to_remove:
                title_bs.remove(b)
            y_overlapping_blocks.append(current_row)

        # Sort by x coordinate and merge the title blocks
        to_remove_blocks = []
        for row in y_overlapping_blocks:
            if len(row) == 1:
                continue

            # Sort by x coordinate
            row.sort(key=lambda x: x['bbox'][0])

            merged_block = row[0]
            for i in range(1, len(row)):
                left_block = merged_block
                right_block = row[i]

                left_height = left_block['bbox'][3] - left_block['bbox'][1]
                right_height = right_block['bbox'][3] - right_block['bbox'][1]

                if (
                    right_block['bbox'][0] - left_block['bbox'][2] < x_distance_threshold
                    and left_height * 0.95 < right_height < left_height * 1.05
                ):
                    merged_block, to_remove_block = merge_two_blocks(merged_block, right_block)
                    to_remove_blocks.append(to_remove_block)
                else:
                    merged_block = right_block

        for b in to_remove_blocks:
            blocks.remove(b)

    """Gather the bboxes of all blocks"""
    # The interline_equation_blocks parameter is not accurate enough; switch to interline_equations below
    interline_equation_blocks = []
    if len(interline_equation_blocks) > 0:
        all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_body_blocks, img_caption_blocks, img_footnote_blocks,
            table_body_blocks, table_caption_blocks, table_footnote_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equation_blocks,
            page_w,
            page_h,
        )
    else:
        all_bboxes, all_discarded_blocks, footnote_blocks = ocr_prepare_bboxes_for_layout_split_v2(
            img_body_blocks, img_caption_blocks, img_footnote_blocks,
            table_body_blocks, table_caption_blocks, table_footnote_blocks,
            discarded_blocks,
            text_blocks,
            title_blocks,
            interline_equations,
            page_w,
            page_h,
        )

    """Get all span info"""
    spans = magic_model.get_all_spans(page_id)

    """Before removing duplicate spans, filter image and table spans through the image_body and table_body blocks"""
    """Also remove large watermarks and keep abandoned spans"""
    spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)

    """Remove the lower-confidence ones among overlapping spans"""
    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
    """Remove the smaller ones among overlapping spans"""
    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)

    """Construct the spans according to parse_mode; mainly character filling for text spans"""
    if parse_mode == SupportedPdfParseMethod.TXT:
        """Use the new hybrid OCR scheme."""
        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
    elif parse_mode == SupportedPdfParseMethod.OCR:
        pass
    else:
        raise Exception('parse_mode must be txt or ocr')

    """First handle the discarded_blocks, which need no layout analysis"""
    discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4)
    fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)

    """Skip the current page if it has no valid bbox"""
    if len(all_bboxes) == 0:
        logger.warning(f'skip this page, not found useful bbox, page_id: {page_id}')
        return ocr_construct_page_component_v2(
            [], [], page_id, page_w, page_h, [], [], [], interline_equations, fix_discarded_blocks,
            need_drop, drop_reason,
        )

    """Crop images and tables"""
    spans = ocr_cut_image_and_table(spans, page_doc, page_id, pdf_bytes_md5, imageWriter)

    """Fill the spans into the blocks"""
    block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5)

    """Fix the blocks"""
    fix_blocks = fix_block_spans_v2(block_with_spans)

    """Merge titles that were broken apart on the same line"""
    merge_title_blocks(fix_blocks)

    """Get all lines and compute the body line height"""
    line_height = get_line_height(fix_blocks)

    """Get all lines and sort them"""
    sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks)

    """Compute the block ordering from the median of the line indexes"""
    fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)

    """Revert the image and table blocks back into group form for the rest of the pipeline"""
    fix_blocks = revert_group_blocks(fix_blocks)

    """Re-sort the blocks"""
    sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])

    """Re-sort within blocks (ordering of multiple captions or footnotes inside image and table blocks)"""
    for block in sorted_blocks:
        if block['type'] in [BlockType.Image, BlockType.Table]:
            block['blocks'] = sorted(block['blocks'], key=lambda b: b['index'])

    """Get the lists that QA needs externalized"""
    images, tables, interline_equations = get_qa_need_list_v2(sorted_blocks)

    """Construct pdf_info_dict"""
    page_info = ocr_construct_page_component_v2(
        sorted_blocks, [], page_id, page_w, page_h, [], images, tables, interline_equations,
        fix_discarded_blocks, need_drop, drop_reason,
    )
    return page_info


def pdf_parse_union(
    model_list,
    dataset: Dataset,
    imageWriter,
    parse_mode,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
    lang=None,
):
    pdf_bytes_md5 = compute_md5(dataset.data_bits())

    """Initialize an empty pdf_info_dict"""
    pdf_info_dict = {}

    """Initialize magic_model with model_list and the docs object"""
    magic_model = MagicModel(model_list, dataset)

    """Parse the pdf over the requested page range"""
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
        else len(dataset) - 1
    )

    if end_page_id > len(dataset) - 1:
        logger.warning('end_page_id is out of range, use pdf_docs length')
        end_page_id = len(dataset) - 1

    # """Initialize the start time"""
    # start_time = time.time()

    # for page_id, page in enumerate(dataset):
    for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"):
        # """In debug mode, log the time spent parsing each page."""
        # if debug_mode:
        #     time_now = time.time()
        #     logger.info(
        #         f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
        #     )
        #     start_time = time_now

        """Parse each page of the pdf"""
        if start_page_id <= page_id <= end_page_id:
            page_info = parse_page_core(
                page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
            )
        else:
            page_info = page.get_page_info()
            page_w = page_info.w
            page_h = page_info.h
            page_info = ocr_construct_page_component_v2(
                [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
            )
        pdf_info_dict[f'page_{page_id}'] = page_info

    need_ocr_list = []
    img_crop_list = []
    text_block_list = []
    for pange_id, page_info in pdf_info_dict.items():
        for block in page_info['preproc_blocks']:
            if block['type'] in ['table', 'image']:
                for sub_block in block['blocks']:
                    if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
                        text_block_list.append(sub_block)
            elif block['type'] in ['text', 'title']:
                text_block_list.append(block)
        for block in page_info['discarded_blocks']:
            text_block_list.append(block)
    for block in text_block_list:
        for line in block['lines']:
            for span in line['spans']:
                if 'np_img' in span:
                    need_ocr_list.append(span)
                    img_crop_list.append(span['np_img'])
                    span.pop('np_img')
    if len(img_crop_list) > 0:
        # Get OCR results for this language's images
        atom_model_manager = AtomModelSingleton()
        ocr_model = atom_model_manager.get_atom_model(
            atom_model_name='ocr',
            ocr_show_log=False,
            det_db_box_thresh=0.3,
            lang=lang
        )
        # rec_start = time.time()
        ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
        # Verify we have matching counts
        assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
        # Process OCR results for this language
        for index, span in enumerate(need_ocr_list):
            ocr_text, ocr_score = ocr_res_list[index]
            span['content'] = ocr_text
            span['score'] = float(f"{ocr_score:.3f}")
        # rec_time = time.time() - rec_start
        # logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')

    """Paragraph splitting"""
    para_split(pdf_info_dict)

    """LLM-aided optimization"""
    llm_aided_config = get_llm_aided_config()
    if llm_aided_config is not None:
        """Formula optimization"""
        formula_aided_config = llm_aided_config.get('formula_aided', None)
        if formula_aided_config is not None:
            if formula_aided_config.get('enable', False):
                llm_aided_formula_start_time = time.time()
                llm_aided_formula(pdf_info_dict, formula_aided_config)
                logger.info(f'llm aided formula time: {round(time.time() - llm_aided_formula_start_time, 2)}')
        """Text optimization"""
        text_aided_config = llm_aided_config.get('text_aided', None)
        if text_aided_config is not None:
            if text_aided_config.get('enable', False):
                llm_aided_text_start_time = time.time()
                llm_aided_text(pdf_info_dict, text_aided_config)
                logger.info(f'llm aided text time: {round(time.time() - llm_aided_text_start_time, 2)}')
        """Title optimization"""
        title_aided_config = llm_aided_config.get('title_aided', None)
        if title_aided_config is not None:
            if title_aided_config.get('enable', False):
                llm_aided_title_start_time = time.time()
                llm_aided_title(pdf_info_dict, title_aided_config)
                logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')

    """Convert the dict to a list"""
    pdf_info_list = dict_to_list(pdf_info_dict)
    new_pdf_info_dict = {
        'pdf_info': pdf_info_list,
    }

    clean_memory(get_device())

    return new_pdf_info_dict


if __name__ == '__main__':
    pass
```
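
The contrast gate above (standard deviation over mean of the grayscale crop, with a 0.17 cutoff) is easy to sanity-check in isolation. A minimal sketch with synthetic crops; the arrays are illustrative, not taken from the pipeline:

```python
import numpy as np
import cv2

def calculate_contrast(img, img_mode) -> float:
    # Same definition as above: std / (mean + eps) on the grayscale image
    code = cv2.COLOR_BGR2GRAY if img_mode == 'bgr' else cv2.COLOR_RGB2GRAY
    gray = cv2.cvtColor(img, code)
    return round(float(np.std(gray)) / (float(np.mean(gray)) + 1e-6), 2)

# A near-uniform light-gray crop: contrast ~0.0, would be skipped (<= 0.17)
flat = np.full((32, 128, 3), 200, dtype=np.uint8)
# A black-text-on-white style crop: high contrast, would be kept for OCR
text_like = np.full((32, 128, 3), 255, dtype=np.uint8)
text_like[12:20, :] = 0

print(calculate_contrast(flat, 'bgr'))       # ~0.0 -> dropped before OCR
print(calculate_contrast(text_like, 'bgr'))  # ~0.58, well above 0.17 -> kept
```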
magic_pdf/pre_proc/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/pre_proc/construct_page_dict.py (deleted, 100644 → 0)

```python
def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h,
                                    layout_tree, images, tables, interline_equations,
                                    discarded_blocks, need_drop, drop_reason):
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'discarded_blocks': discarded_blocks,
        'need_drop': need_drop,
        'drop_reason': drop_reason,
    }
    return return_dict
```
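
For reference, `pdf_parse_union` above builds its placeholder entry for skipped pages with exactly this constructor. A standalone sketch; the page number and page size here are illustrative:

```python
# Illustrative placeholder page, mirroring the 'skip page' path in pdf_parse_union.
page_info = ocr_construct_page_component_v2(
    [], [], page_id=3, page_w=612, page_h=792,
    layout_tree=[], images=[], tables=[], interline_equations=[],
    discarded_blocks=[], need_drop=True, drop_reason='skip page',
)
assert page_info['page_idx'] == 3 and page_info['page_size'] == [612, 792]
```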
magic_pdf/pre_proc/cut_image.py (deleted, 100644 → 0)

```python
from loguru import logger

from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.pdf_image_tools import cut_image


def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    def return_path(type):
        return join_path(pdf_bytes_md5, type)

    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'),
                                           imageWriter=imageWriter)
        elif span_type == ContentType.Table:
            if not check_img_bbox(span['bbox']) or not imageWriter:
                continue
            span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'),
                                           imageWriter=imageWriter)

    return spans


def check_img_bbox(bbox) -> bool:
    if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
        logger.warning(f'image_bboxes: invalid box, {bbox}')
        return False
    return True
```
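
Note that `check_img_bbox` only rejects degenerate boxes, i.e. those with zero or negative width or height. A quick illustrative check:

```python
# Illustrative coordinates; [x0, y0, x1, y1] with x0 < x1 and y0 < y1 expected.
assert check_img_bbox([10, 10, 110, 60]) is True    # 100x50 box: valid
assert check_img_bbox([110, 10, 10, 60]) is False   # x0 >= x1: rejected, warning logged
assert check_img_bbox([10, 60, 110, 60]) is False   # zero height: rejected
```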
magic_pdf/pre_proc/ocr_span_list_modify.py (deleted, 100644 → 0)

```python
from magic_pdf.config.drop_tag import DropTag
from magic_pdf.config.ocr_content_type import BlockType
from magic_pdf.libs.boxbase import calculate_iou, get_minbox_if_overlap_by_ratio


def remove_overlaps_low_confidence_spans(spans):
    dropped_spans = []
    # Remove the lower-confidence ones among overlapping spans
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 should already be in dropped_spans
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
                        if span1['score'] < span2['score']:
                            span_need_remove = span1
                        else:
                            span_need_remove = span2
                        if (
                            span_need_remove is not None
                            and span_need_remove not in dropped_spans
                        ):
                            dropped_spans.append(span_need_remove)

    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
            span_need_remove['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans


def check_chars_is_overlap_in_span(chars):
    for i in range(len(chars)):
        for j in range(i + 1, len(chars)):
            if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.35:
                return True
    return False


def remove_x_overlapping_chars(span, median_width):
    """
    Remove characters from a span that overlap significantly on the x-axis.

    Args:
        median_width:
        span (dict): A span containing a list of chars, each with bbox coordinates
                     in the format [x0, y0, x1, y1]

    Returns:
        dict: The span with overlapping characters removed
    """
    if 'chars' not in span or len(span['chars']) < 2:
        return span

    overlap_threshold = median_width * 0.3

    i = 0
    while i < len(span['chars']) - 1:
        char1 = span['chars'][i]
        char2 = span['chars'][i + 1]

        # Calculate overlap width
        x_left = max(char1['bbox'][0], char2['bbox'][0])
        x_right = min(char1['bbox'][2], char2['bbox'][2])

        if x_right > x_left:
            # There is overlap
            overlap_width = x_right - x_left
            if overlap_width > overlap_threshold:
                if char1['c'] == char2['c'] or char1['c'] == ' ' or char2['c'] == ' ':
                    # Determine which character to remove
                    width1 = char1['bbox'][2] - char1['bbox'][0]
                    width2 = char2['bbox'][2] - char2['bbox'][0]
                    if width1 < width2:
                        # Remove the narrower character
                        span['chars'].pop(i)
                    else:
                        span['chars'].pop(i + 1)
                else:
                    i += 1
                # Don't increment i since we need to check the new pair
            else:
                i += 1
        else:
            i += 1

    return span


def remove_overlaps_min_spans(spans):
    dropped_spans = []
    # Remove the smaller ones among overlapping spans
    for span1 in spans:
        for span2 in spans:
            if span1 != span2:
                # Neither span1 nor span2 should already be in dropped_spans
                if span1 in dropped_spans or span2 in dropped_spans:
                    continue
                else:
                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
                    if overlap_box is not None:
                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                        if span_need_remove is not None and span_need_remove not in dropped_spans:
                            dropped_spans.append(span_need_remove)

    if len(dropped_spans) > 0:
        for span_need_remove in dropped_spans:
            spans.remove(span_need_remove)
            span_need_remove['tag'] = DropTag.SPAN_OVERLAP

    return spans, dropped_spans


def get_qa_need_list_v2(blocks):
    # Create copies of images, tables, interline_equations, inline_equations
    images = []
    tables = []
    interline_equations = []

    for block in blocks:
        if block['type'] == BlockType.Image:
            images.append(block)
        elif block['type'] == BlockType.Table:
            tables.append(block)
        elif block['type'] == BlockType.InterlineEquation:
            interline_equations.append(block)
    return images, tables, interline_equations
```
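
A small illustrative run of `remove_x_overlapping_chars` on a synthetic span; the char dicts mimic pymupdf's rawdict format and the coordinates are made up:

```python
# Two 'l' glyphs stacked almost on top of each other, plus a normal 'o'.
span = {'chars': [
    {'c': 'l', 'bbox': [10.0, 0.0, 14.0, 10.0]},
    {'c': 'l', 'bbox': [10.5, 0.0, 14.5, 10.0]},  # duplicate glyph, ~3.5pt x-overlap
    {'c': 'o', 'bbox': [16.0, 0.0, 24.0, 10.0]},
]}
median_width = 6.0  # overlap_threshold = 6.0 * 0.3 = 1.8
span = remove_x_overlapping_chars(span, median_width)
print(''.join(c['c'] for c in span['chars']))  # 'lo' - the duplicate glyph is dropped
```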
magic_pdf/pre_proc/remove_bbox_overlap.py (deleted, 100644 → 0)

```python
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.boxbase import _is_in, _is_part_overlap


def _remove_overlap_between_bbox(bbox1, bbox2):
    if _is_part_overlap(bbox1, bbox2):
        ix0, iy0, ix1, iy1 = bbox1
        x0, y0, x1, y1 = bbox2

        diff_x = min(x1, ix1) - max(x0, ix0)
        diff_y = min(y1, iy1) - max(y0, iy0)

        if diff_y > diff_x:
            if x1 >= ix1:
                mid = (x0 + ix1) // 2
                ix1 = min(mid - 0.25, ix1)
                x0 = max(mid + 0.25, x0)
            else:
                mid = (ix0 + x1) // 2
                ix0 = max(mid + 0.25, ix0)
                x1 = min(mid - 0.25, x1)
        else:
            if y1 >= iy1:
                mid = (y0 + iy1) // 2
                y0 = max(mid + 0.25, y0)
                iy1 = min(iy1, mid - 0.25)
            else:
                mid = (iy0 + y1) // 2
                y1 = min(y1, mid - 0.25)
                iy0 = max(mid + 0.25, iy0)

        if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
            bbox1 = [ix0, iy0, ix1, iy1]
            bbox2 = [x0, y0, x1, y1]
            return bbox1, bbox2, None
        else:
            return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
    else:
        return bbox1, bbox2, None


def _remove_overlap_between_bboxes(arr):
    drop_reasons = []
    N = len(arr)
    keeps = [True] * N
    res = [None] * N
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if _is_in(arr[i]['bbox'], arr[j]['bbox']):
                keeps[i] = False

    for idx, v in enumerate(arr):
        if not keeps[idx]:
            continue
        for i in range(N):
            if res[i] is None:
                continue

            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
                v['bbox'], res[i]['bbox']
            )
            if drop_reason is None:
                v['bbox'] = bbox1
                res[i]['bbox'] = bbox2
            else:
                if v['score'] > res[i]['score']:
                    keeps[i] = False
                    res[i] = None
                else:
                    keeps[idx] = False
                drop_reasons.append(drop_reason)
        if keeps[idx]:
            res[idx] = v
    return res, drop_reasons


def remove_overlap_between_bbox_for_span(spans):
    arr = [{'bbox': span['bbox'], 'score': span.get('score', 0.1)} for span in spans]
    res, drop_reasons = _remove_overlap_between_bboxes(arr)
    ret = []
    for i in range(len(res)):
        if res[i] is None:
            continue
        spans[i]['bbox'] = res[i]['bbox']
        ret.append(spans[i])
    return ret, drop_reasons


def remove_overlap_between_bbox_for_block(all_bboxes):
    arr = [{'bbox': bbox[:4], 'score': bbox[-1]} for bbox in all_bboxes]
    res, drop_reasons = _remove_overlap_between_bboxes(arr)
    ret = []
    for i in range(len(res)):
        if res[i] is None:
            continue
        all_bboxes[i][:4] = res[i]['bbox']
        ret.append(all_bboxes[i])
    return ret, drop_reasons
```
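
The splitting rule above shaves the overlap off along the axis with the smaller intersection, leaving a 0.5pt gap around the midpoint. A worked example with two boxes that overlap horizontally; the coordinates are illustrative, and it assumes `_is_part_overlap` reports this pair as partially overlapping, which is the intended case:

```python
# bbox1 ends at x=100, bbox2 starts at x=90: a 10pt-wide, full-height overlap.
# diff_x = 10, diff_y = 50, so the split happens along x at mid = (90 + 100) // 2 = 95.
b1, b2, reason = _remove_overlap_between_bbox([0, 0, 100, 50], [90, 0, 200, 50])
assert b1 == [0, 0, 94.75, 50]      # right edge pulled back to mid - 0.25
assert b2 == [95.25, 0, 200, 50]    # left edge pushed out to mid + 0.25
assert reason is None               # both boxes kept positive area
```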
magic_pdf/resources/model_config/model_configs.yaml (deleted, 100644 → 0)

```yaml
weights:
  layoutlmv3: Layout/LayoutLMv3/model_final.pth
  doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
  unimernet_small: MFR/unimernet_hf_small_2503
  struct_eqtable: TabRec/StructEqTable
  tablemaster: TabRec/TableMaster
  rapid_table: TabRec/RapidTable
```

\ No newline at end of file
magic_pdf/resources/slanet_plus/slanet-plus.onnx (deleted, 100644 → 0)

File deleted
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt (deleted, 100644 → 0)

File deleted
magic_pdf/spark/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/spark/spark_api.py (deleted, 100644 → 0)

```python
from loguru import logger

from magic_pdf.config.drop_reason import DropReason


def get_data_source(jso: dict):
    data_source = jso.get('data_source')
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source


def get_data_type(jso: dict):
    data_type = jso.get('data_type')
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get('bookid')
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso['_need_drop'] = True
    jso['_drop_reason'] = DropReason.Exception
    jso['_exception'] = f'ERROR: {e}'
    return jso


def get_bookname(jso: dict):
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f'{data_source}/{file_id}'
    return book_name


def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the json and return a dict."""
    return {
        '_pdf_type': jso['_pdf_type'],
        'model_list': jso['doc_layout_result'],
    }
```
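
A quick illustrative use of `exception_handler` on a Spark-style record; the record fields below are made up:

```python
jso = {'file_id': 'doc-001', 'data_source': 'demo-bucket'}
try:
    raise ValueError('layout result missing')
except ValueError as e:
    jso = exception_handler(jso, e)

# The record is flagged for dropping and carries the error text
assert jso['_need_drop'] is True
assert jso['_exception'] == 'ERROR: layout result missing'
```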
magic_pdf/tools/__init__.py (deleted, 100644 → 0, empty file)
magic_pdf/tools/cli.py (deleted, 100644 → 0)

```python
import os
import shutil
import tempfile
from pathlib import Path

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf

pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']


@click.command()
@click.version_option(__version__, '--version', '-v', help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=str,
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
    You should input "Abbreviation" with language form url:
    https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations
    """,
    default=None,
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
    os.makedirs(output_dir, exist_ok=True)
    temp_dir = tempfile.mkdtemp()

    def read_fn(path: Path):
        if path.suffix in ms_office_suffixes:
            convert_file_to_pdf(str(path), temp_dir)
            fn = os.path.join(temp_dir, f'{path.stem}.pdf')
        elif path.suffix in image_suffixes:
            with open(str(path), 'rb') as f:
                bits = f.read()
            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
            fn = os.path.join(temp_dir, f'{path.stem}.pdf')
            with open(fn, 'wb') as f:
                f.write(pdf_bytes)
        elif path.suffix in pdf_suffixes:
            fn = str(path)
        else:
            raise Exception(f'Unknown file suffix: {path.suffix}')

        disk_rw = FileBasedDataReader(os.path.dirname(fn))
        return disk_rw.read(os.path.basename(fn))

    def parse_doc(doc_path: Path, dataset: Dataset | None = None):
        try:
            file_name = str(Path(doc_path).stem)
            if dataset is None:
                pdf_data_or_dataset = read_fn(doc_path)
            else:
                pdf_data_or_dataset = dataset
            do_parse(
                output_dir,
                file_name,
                pdf_data_or_dataset,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        doc_paths = []
        for doc_path in Path(path).glob('*'):
            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
                if doc_path.suffix in ms_office_suffixes:
                    convert_file_to_pdf(str(doc_path), temp_dir)
                    doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
                elif doc_path.suffix in image_suffixes:
                    with open(str(doc_path), 'rb') as f:
                        bits = f.read()
                    pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
                    fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
                    with open(fn, 'wb') as f:
                        f.write(pdf_bytes)
                    doc_path = Path(fn)
                doc_paths.append(doc_path)
        datasets = batch_build_dataset(doc_paths, 4, lang)
        batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
    else:
        parse_doc(Path(path))

    shutil.rmtree(temp_dir)


if __name__ == '__main__':
    cli()
```
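
As a hedged sketch of driving this now-deleted CLI: this command was, to my understanding, exposed as the `magic-pdf` entry point (the equivalent shell call would be `magic-pdf -p demo.pdf -o ./output -m auto`), and the same invocation can be exercised in-process with click's test runner. The file names here are illustrative:

```python
# In-process invocation of the deleted CLI via click's test runner.
# 'demo.pdf' and './output' are placeholder names, not from this diff.
from click.testing import CliRunner
from magic_pdf.tools.cli import cli  # module path as it existed before this commit

runner = CliRunner()
result = runner.invoke(cli, ['-p', 'demo.pdf', '-o', './output', '-m', 'auto'])
print(result.exit_code, result.output)
```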
magic_pdf/tools/cli_dev.py
deleted
100644 → 0
View file @
3bd0ecf1
import json as json_parse
import os
from pathlib import Path

import click

import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods


def read_s3_path(s3path):
    bucket, key = parse_s3path(s3path)

    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
    may_range_params = parse_s3_range_params(s3path)
    if may_range_params is None or 2 != len(may_range_params):
        byte_start, byte_end = 0, -1
    else:
        byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
    return s3_rw.read_at(
        remove_non_official_s3_args(s3path),
        byte_start,
        byte_end,
    )


@click.group()
@click.version_option(__version__, '--version', '-v', help='Show version information')
def cli():
    pass


@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='Input jsonl path, either a local file or one on s3',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='Specify the parsing method. txt: parse a text-based pdf; ocr: parse the pdf with optical character recognition; auto: let the program choose automatically',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='Output to a local directory',
)
def jsonl(jsonl, method, output_dir):
    model_config.__use_inside_model__ = False
    if jsonl.startswith('s3://'):
        jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
    else:
        with open(jsonl) as f:
            jso = json_parse.loads(f.readline())
    os.makedirs(output_dir, exist_ok=True)
    s3_file_path = jso.get('file_location')
    if s3_file_path is None:
        s3_file_path = jso.get('path')
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='Local PDF file',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='JSON data produced by local model inference',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='Local output directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='Specify the parsing method. txt: parse a text-based pdf; ocr: parse the pdf with optical character recognition; auto: let the program choose automatically',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    model_config.__use_inside_model__ = False
    full_pdf_path = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        disk_rw = FileBasedDataReader(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path))

    model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))

    file_name = str(Path(full_pdf_path).stem)
    pdf_data = read_fn(full_pdf_path)
    do_parse(
        output_dir,
        file_name,
        pdf_data,
        model_json_list,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


if __name__ == '__main__':
    cli()
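For reference, a hedged sketch of the one-record-per-line input the `jsonl` subcommand reads: it looks up `file_location` (falling back to `path`) and hands `doc_layout_result` to `do_parse` as the model list. The field values below are placeholders.

import json

record = {
    'file_location': 's3://bucket-name-1/papers/demo.pdf',  # the 'path' key also works
    'doc_layout_result': [],  # pre-computed model inference output, one entry per page
}
print(json.dumps(record, ensure_ascii=False))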
magic_pdf/tools/common.py
deleted
100644 → 0
View file @
3bd0ecf1
import os

import click
import fitz
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import Dataset, PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
                                                         doc_analyze)

# from io import BytesIO
# from pypdf import PdfReader, PdfWriter


def prepare_env(output_dir, pdf_file_name, method):
    local_parent_dir = os.path.join(output_dir, pdf_file_name, method)

    local_image_dir = os.path.join(str(local_parent_dir), 'images')
    local_md_dir = local_parent_dir
    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)
    return local_image_dir, local_md_dir


# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
#     # Wrap the byte data in a BytesIO object
#     pdf_file = BytesIO(pdf_bytes)
#     # Read the PDF byte data
#     reader = PdfReader(pdf_file)
#     # Create a new PDF writer
#     writer = PdfWriter()
#     # Add the selected pages to the new PDF writer
#     end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
#     if end_page_id > len(reader.pages) - 1:
#         logger.warning("end_page_id is out of range, use pdf_docs length")
#         end_page_id = len(reader.pages) - 1
#     for i, page in enumerate(reader.pages):
#         if start_page_id <= i <= end_page_id:
#             writer.add_page(page)
#     # Create a byte buffer to hold the output PDF data
#     output_buffer = BytesIO()
#     # Write the PDF into the byte buffer
#     writer.write(output_buffer)
#     # Fetch the buffer's contents
#     converted_pdf_bytes = output_buffer.getvalue()
#     return converted_pdf_bytes


def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
    document = fitz.open('pdf', pdf_bytes)
    output_document = fitz.open()
    end_page_id = (
        end_page_id
        if end_page_id is not None and end_page_id >= 0
        else len(document) - 1
    )
    if end_page_id > len(document) - 1:
        logger.warning('end_page_id is out of range, use pdf_docs length')
        end_page_id = len(document) - 1
    output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
    output_bytes = output_document.tobytes()
    return output_bytes


def _do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    from magic_pdf.operators.models import InferenceResult
    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True
        # f_draw_char_bbox = True

    if isinstance(pdf_bytes_or_dataset, bytes):
        pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
            pdf_bytes_or_dataset, start_page_id, end_page_id
        )
        ds = PymuDocDataset(pdf_bytes, lang=lang)
    else:
        ds = pdf_bytes_or_dataset
        pdf_bytes = ds._raw_data
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))

    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            if parse_method == 'auto':
                if ds.classify() == SupportedPdfParseMethod.TXT:
                    infer_result = ds.apply(
                        doc_analyze,
                        ocr=False,
                        lang=ds._lang,
                        layout_model=layout_model,
                        formula_enable=formula_enable,
                        table_enable=table_enable,
                    )
                    pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
                else:
                    infer_result = ds.apply(
                        doc_analyze,
                        ocr=True,
                        lang=ds._lang,
                        layout_model=layout_model,
                        formula_enable=formula_enable,
                        table_enable=table_enable,
                    )
                    pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)
            elif parse_method == 'txt':
                infer_result = ds.apply(
                    doc_analyze,
                    ocr=False,
                    lang=ds._lang,
                    layout_model=layout_model,
                    formula_enable=formula_enable,
                    table_enable=table_enable,
                )
                pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
            elif parse_method == 'ocr':
                infer_result = ds.apply(
                    doc_analyze,
                    ocr=True,
                    lang=ds._lang,
                    layout_model=layout_model,
                    formula_enable=formula_enable,
                    table_enable=table_enable,
                )
                pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)
            else:
                logger.error('unknown parse method')
                exit(1)
        else:
            logger.error('need model list input')
            exit(2)
    else:
        infer_result = InferenceResult(model_list, ds)
        if parse_method == 'ocr':
            pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)
        elif parse_method == 'txt':
            pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
        else:
            if ds.classify() == SupportedPdfParseMethod.TXT:
                pipe_result = infer_result.pipe_txt_mode(image_writer, debug_mode=True, lang=ds._lang)
            else:
                pipe_result = infer_result.pipe_ocr_mode(image_writer, debug_mode=True, lang=ds._lang)

    if f_draw_model_bbox:
        infer_result.draw_model(os.path.join(local_md_dir, f'{pdf_file_name}_model.pdf'))
    if f_draw_layout_bbox:
        pipe_result.draw_layout(os.path.join(local_md_dir, f'{pdf_file_name}_layout.pdf'))
    if f_draw_span_bbox:
        pipe_result.draw_span(os.path.join(local_md_dir, f'{pdf_file_name}_spans.pdf'))
    if f_draw_line_sort_bbox:
        pipe_result.draw_line_sort(os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf'))
    if f_draw_char_bbox:
        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')

    if f_dump_md:
        pipe_result.dump_md(
            md_writer,
            f'{pdf_file_name}.md',
            image_dir,
            drop_mode=DropMode.NONE,
            md_make_mode=f_make_md_mode,
        )

    if f_dump_middle_json:
        pipe_result.dump_middle_json(md_writer, f'{pdf_file_name}_middle.json')

    if f_dump_model_json:
        infer_result.dump_model(md_writer, f'{pdf_file_name}_model.json')

    if f_dump_orig_pdf:
        md_writer.write(
            f'{pdf_file_name}_origin.pdf',
            pdf_bytes,
        )

    if f_dump_content_list:
        pipe_result.dump_content_list(md_writer, f'{pdf_file_name}_content_list.json', image_dir)

    logger.info(f'local output dir is {local_md_dir}')


def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    parallel_count = 1
    if os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT'):
        parallel_count = int(os.environ['MINERU_PARALLEL_INFERENCE_COUNT'])

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
        )


def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    dss = []
    for v in pdf_bytes_or_datasets:
        if isinstance(v, bytes):
            dss.append(PymuDocDataset(v, lang=lang))
        else:
            dss.append(v)
    infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
    for idx, infer_result in enumerate(infer_results):
        _do_parse(
            output_dir=output_dir,
            pdf_file_name=pdf_file_names[idx],
            pdf_bytes_or_dataset=dss[idx],
            model_list=infer_result.get_infer_res(),
            parse_method=parse_method,
            debug_able=debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=MakeMode.MM_MD,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
        )


parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
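For reference, a minimal sketch of driving do_parse directly, assuming the packaged models are available (model_config.__use_inside_model__ is true); the file names are hypothetical. Setting MINERU_PARALLEL_INFERENCE_COUNT above 1 reroutes the same call through batch_do_parse.

from magic_pdf.tools.common import do_parse

with open('demo.pdf', 'rb') as f:  # hypothetical input file
    pdf_bytes = f.read()

do_parse(
    './output',   # output_dir; prepare_env puts results under ./output/demo/auto/
    'demo',       # pdf_file_name
    pdf_bytes,    # raw bytes are wrapped in a PymuDocDataset internally
    [],           # empty model_list -> run doc_analyze with the built-in models
    'auto',       # parse_method
    lang='en',
)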
magic_pdf/utils/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
magic_pdf/utils/annotations.py
deleted
100644 → 0
View file @
3bd0ecf1
from loguru import logger


def ImportPIL(f):
    try:
        import PIL  # noqa: F401
    except ImportError:
        logger.error('Pillow not installed, please install by pip.')
        exit(1)
    return f
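A hedged usage sketch: ImportPIL is a decorator that aborts at decoration time if Pillow cannot be imported and otherwise returns the function unchanged. The decorated function below is hypothetical.

from magic_pdf.utils.annotations import ImportPIL


@ImportPIL
def image_size(path):
    from PIL import Image
    with Image.open(path) as im:
        return im.size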
magic_pdf/utils/office_to_pdf.py
deleted
100644 → 0
View file @
3bd0ecf1
import os
import subprocess
import platform
from pathlib import Path
import shutil
from loguru import logger


class ConvertToPdfError(Exception):
    def __init__(self, msg):
        self.msg = msg
        super().__init__(self.msg)


def check_fonts_installed():
    """Check if required Chinese fonts are installed."""
    system_type = platform.system()
    if system_type in ['Windows', 'Darwin']:
        pass
    else:
        # Linux: use fc-list
        try:
            output = subprocess.check_output(['fc-list', ':lang=zh'], encoding='utf-8')
            if output.strip():
                # Any non-empty output means a Chinese font is present
                return True
            else:
                logger.warning('No Chinese fonts were detected, the converted document may not display Chinese content properly.')
        except Exception:
            pass


def get_soffice_command():
    """Return the path to LibreOffice's soffice executable depending on the platform."""
    system_type = platform.system()

    # First check if soffice is in PATH
    soffice_path = shutil.which('soffice')
    if soffice_path:
        return soffice_path

    if system_type == 'Windows':
        # Check common installation paths
        possible_paths = [
            Path(os.environ.get('PROGRAMFILES', 'C:/Program Files')) / 'LibreOffice/program/soffice.exe',
            Path(os.environ.get('PROGRAMFILES(X86)', 'C:/Program Files (x86)')) / 'LibreOffice/program/soffice.exe',
            Path('C:/Program Files/LibreOffice/program/soffice.exe'),
            Path('C:/Program Files (x86)/LibreOffice/program/soffice.exe')
        ]
        # Check other drives for windows
        for drive in ['C:', 'D:', 'E:', 'F:', 'G:', 'H:']:
            possible_paths.append(Path(f"{drive}/LibreOffice/program/soffice.exe"))

        for path in possible_paths:
            if path.exists():
                return str(path)

        raise ConvertToPdfError(
            "LibreOffice not found. Please install LibreOffice from https://www.libreoffice.org/ "
            "or ensure soffice.exe is in your PATH environment variable."
        )
    else:
        # For Linux/macOS, provide installation instructions if not found
        try:
            # Try to find soffice in standard locations
            possible_paths = [
                '/usr/bin/soffice',
                '/usr/local/bin/soffice',
                '/opt/libreoffice/program/soffice',
                '/Applications/LibreOffice.app/Contents/MacOS/soffice'
            ]
            for path in possible_paths:
                if os.path.exists(path):
                    return path

            raise ConvertToPdfError(
                "LibreOffice not found. Please install it:\n"
                "  - Ubuntu/Debian: sudo apt-get install libreoffice\n"
                "  - CentOS/RHEL: sudo yum install libreoffice\n"
                "  - macOS: brew install libreoffice or download from https://www.libreoffice.org/\n"
                "  - Or ensure soffice is in your PATH environment variable."
            )
        except Exception as e:
            raise ConvertToPdfError(f"Error locating LibreOffice: {str(e)}")


def convert_file_to_pdf(input_path, output_dir):
    """Convert a single document (ppt, doc, etc.) to PDF."""
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    check_fonts_installed()
    soffice_cmd = get_soffice_command()

    cmd = [
        soffice_cmd,
        '--headless',
        '--norestore',
        '--invisible',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]

    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if process.returncode != 0:
        raise ConvertToPdfError(f"LibreOffice convert failed: {process.stderr.decode()}")
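A minimal usage sketch, assuming LibreOffice is installed; the input file name is hypothetical.

from magic_pdf.utils.office_to_pdf import ConvertToPdfError, convert_file_to_pdf

try:
    convert_file_to_pdf('slides.pptx', '/tmp/pdf_out')
    # soffice writes the result to /tmp/pdf_out/slides.pdf
except ConvertToPdfError as e:
    print(f'conversion failed: {e}')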
magic-pdf.template.json → mineru.template.json
View file @
0c7a0882
@@ -3,23 +3,6 @@
         "bucket-name-1":["ak", "sk", "endpoint"],
         "bucket-name-2":["ak", "sk", "endpoint"]
     },
-    "models-dir": "/tmp/models",
-    "layoutreader-model-dir": "/tmp/layoutreader",
-    "device-mode": "cpu",
-    "layout-config": {
-        "model": "doclayout_yolo"
-    },
-    "formula-config": {
-        "mfd_model": "yolo_v8_mfd",
-        "mfr_model": "unimernet_small",
-        "enable": true
-    },
-    "table-config": {
-        "model": "rapid_table",
-        "sub_model": "slanet_plus",
-        "enable": true,
-        "max_time": 400
-    },
     "latex-delimiter-config": {
         "display": {
             "left": "$$",
@@ -31,18 +14,6 @@
         }
     },
     "llm-aided-config": {
-        "formula_aided": {
-            "api_key": "your_api_key",
-            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-7b-instruct",
-            "enable": false
-        },
-        "text_aided": {
-            "api_key": "your_api_key",
-            "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-            "model": "qwen2.5-7b-instruct",
-            "enable": false
-        },
         "title_aided": {
             "api_key": "your_api_key",
             "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
@@ -50,5 +21,9 @@
             "enable": false
         }
     },
-    "config_version": "1.2.1"
+    "models-dir": {
+        "pipeline": "",
+        "vlm": ""
+    },
+    "config_version": "1.3.0"
 }
\ No newline at end of file
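The practical upshot of this rename and schema change, as a hedged Python sketch (the config path is hypothetical): "models-dir" becomes an object with separate pipeline and vlm entries, and config_version moves to 1.3.0.

import json

with open('mineru.template.json') as f:
    cfg = json.load(f)

assert cfg['config_version'] == '1.3.0'
pipeline_models_dir = cfg['models-dir']['pipeline']  # '' in the template
vlm_models_dir = cfg['models-dir']['vlm']            # '' in the template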