Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
0c7a0882
Unverified
Commit
0c7a0882
authored
Jun 12, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jun 12, 2025
Browse files
Merge pull request #2611 from myhloli/dev
Dev
parents
3bd0ecf1
a392f445
Changes
262
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2036 deletions
+0
-2036
.pre-commit-config.yaml
.pre-commit-config.yaml
+0
-47
magic_pdf/config/constants.py
magic_pdf/config/constants.py
+0
-60
magic_pdf/config/drop_reason.py
magic_pdf/config/drop_reason.py
+0
-35
magic_pdf/config/drop_tag.py
magic_pdf/config/drop_tag.py
+0
-19
magic_pdf/config/enums.py
magic_pdf/config/enums.py
+0
-7
magic_pdf/config/make_content_config.py
magic_pdf/config/make_content_config.py
+0
-11
magic_pdf/config/model_block_type.py
magic_pdf/config/model_block_type.py
+0
-10
magic_pdf/config/ocr_content_type.py
magic_pdf/config/ocr_content_type.py
+0
-40
magic_pdf/data/batch_build_dataset.py
magic_pdf/data/batch_build_dataset.py
+0
-167
magic_pdf/data/data_reader_writer/__init__.py
magic_pdf/data/data_reader_writer/__init__.py
+0
-12
magic_pdf/data/dataset.py
magic_pdf/data/dataset.py
+0
-408
magic_pdf/data/io/__init__.py
magic_pdf/data/io/__init__.py
+0
-6
magic_pdf/data/read_api.py
magic_pdf/data/read_api.py
+0
-142
magic_pdf/data/utils.py
magic_pdf/data/utils.py
+0
-166
magic_pdf/filter/__init__.py
magic_pdf/filter/__init__.py
+0
-32
magic_pdf/filter/pdf_classify_by_type.py
magic_pdf/filter/pdf_classify_by_type.py
+0
-395
magic_pdf/filter/pdf_meta_scan.py
magic_pdf/filter/pdf_meta_scan.py
+0
-397
magic_pdf/integrations/__init__.py
magic_pdf/integrations/__init__.py
+0
-0
magic_pdf/integrations/rag/__init__.py
magic_pdf/integrations/rag/__init__.py
+0
-0
magic_pdf/integrations/rag/api.py
magic_pdf/integrations/rag/api.py
+0
-82
No files found.
.pre-commit-config.yaml
deleted
100644 → 0
View file @
3bd0ecf1
repos
:
-
repo
:
https://github.com/PyCQA/flake8
rev
:
5.0.4
hooks
:
-
id
:
flake8
args
:
[
"
--max-line-length=150"
,
"
--ignore=E131,E125,W503,W504,E203"
]
-
repo
:
https://github.com/PyCQA/isort
rev
:
5.11.5
hooks
:
-
id
:
isort
-
repo
:
https://github.com/pre-commit/mirrors-yapf
rev
:
v0.32.0
hooks
:
-
id
:
yapf
args
:
[
"
--style={based_on_style:
google,
column_limit:
150,
indent_width:
4}"
]
-
repo
:
https://github.com/codespell-project/codespell
rev
:
v2.2.1
hooks
:
-
id
:
codespell
args
:
[
'
--skip'
,
'
*.json'
]
-
repo
:
https://github.com/pre-commit/pre-commit-hooks
rev
:
v4.3.0
hooks
:
-
id
:
trailing-whitespace
-
id
:
check-yaml
-
id
:
end-of-file-fixer
-
id
:
requirements-txt-fixer
-
id
:
double-quote-string-fixer
-
id
:
check-merge-conflict
-
id
:
fix-encoding-pragma
args
:
[
"
--remove"
]
-
id
:
mixed-line-ending
args
:
[
"
--fix=lf"
]
-
repo
:
https://github.com/executablebooks/mdformat
rev
:
0.7.9
hooks
:
-
id
:
mdformat
args
:
[
"
--number"
,
"
--table-width"
,
"
200"
]
additional_dependencies
:
-
mdformat-openmmlab
-
mdformat_frontmatter
-
linkify-it-py
-
repo
:
https://github.com/myint/docformatter
rev
:
v1.3.1
hooks
:
-
id
:
docformatter
args
:
[
"
--in-place"
,
"
--wrap-descriptions"
,
"
119"
]
magic_pdf/config/constants.py
deleted
100644 → 0
View file @
3bd0ecf1
"""span维度自定义字段."""
# span是否是跨页合并的
CROSS_PAGE
=
'cross_page'
"""
block维度自定义字段
"""
# block中lines是否被删除
LINES_DELETED
=
'lines_deleted'
# table recognition max time default value
TABLE_MAX_TIME_VALUE
=
400
# pp_table_result_max_length
TABLE_MAX_LEN
=
480
# table master structure dict
TABLE_MASTER_DICT
=
'table_master_structure_dict.txt'
# table master dir
TABLE_MASTER_DIR
=
'table_structure_tablemaster_infer/'
# pp detect model dir
DETECT_MODEL_DIR
=
'ch_PP-OCRv4_det_infer'
# pp rec model dir
REC_MODEL_DIR
=
'ch_PP-OCRv4_rec_infer'
# pp rec char dict path
REC_CHAR_DICT
=
'ppocr_keys_v1.txt'
# pp rec copy rec directory
PP_REC_DIRECTORY
=
'.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
# pp rec copy det directory
PP_DET_DIRECTORY
=
'.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
class
MODEL_NAME
:
# pp table structure algorithm
TABLE_MASTER
=
'tablemaster'
# struct eqtable
STRUCT_EQTABLE
=
'struct_eqtable'
DocLayout_YOLO
=
'doclayout_yolo'
LAYOUTLMv3
=
'layoutlmv3'
YOLO_V8_MFD
=
'yolo_v8_mfd'
UniMerNet_v2_Small
=
'unimernet_small'
RAPID_TABLE
=
'rapid_table'
YOLO_V11_LangDetect
=
'yolo_v11n_langdetect'
PARSE_TYPE_TXT
=
'txt'
PARSE_TYPE_OCR
=
'ocr'
magic_pdf/config/drop_reason.py
deleted
100644 → 0
View file @
3bd0ecf1
class
DropReason
:
TEXT_BLCOK_HOR_OVERLAP
=
'text_block_horizontal_overlap'
# 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP
=
(
'useful_block_horizontal_overlap'
# 需保留的block水平覆盖
)
COMPLICATED_LAYOUT
=
'complicated_layout'
# 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS
=
'too_many_layout_columns'
# 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX
=
'color_background_text_box'
# 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
HIGH_COMPUTATIONAL_lOAD_BY_IMGS
=
(
'high_computational_load_by_imgs'
# 含特殊图片,计算量太大,从而丢弃
)
HIGH_COMPUTATIONAL_lOAD_BY_SVGS
=
(
'high_computational_load_by_svgs'
# 特殊的SVG图,计算量太大,从而丢弃
)
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
=
'high_computational_load_by_total_pages'
# 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT
=
'missing doc_layout_result'
# 版面分析失败
Exception
=
'_exception'
# 解析中发生异常
ENCRYPTED
=
'encrypted'
# PDF是加密的
EMPTY_PDF
=
'total_page=0'
# PDF页面总数为0
NOT_IS_TEXT_PDF
=
'not_is_text_pdf'
# 不是文字版PDF,无法直接解析
DENSE_SINGLE_LINE_BLOCK
=
'dense_single_line_block'
# 无法清晰的分段
TITLE_DETECTION_FAILED
=
'title_detection_failed'
# 探测标题失败
TITLE_LEVEL_FAILED
=
(
'title_level_failed'
# 分析标题级别失败(例如一级、二级、三级标题)
)
PARA_SPLIT_FAILED
=
'para_split_failed'
# 识别段落失败
PARA_MERGE_FAILED
=
'para_merge_failed'
# 段落合并失败
NOT_ALLOW_LANGUAGE
=
'not_allow_language'
# 不支持的语种
SPECIAL_PDF
=
'special_pdf'
PSEUDO_SINGLE_COLUMN
=
'pseudo_single_column'
# 无法精确判断文字分栏
CAN_NOT_DETECT_PAGE_LAYOUT
=
'can_not_detect_page_layout'
# 无法分析页面的版面
NEGATIVE_BBOX_AREA
=
'negative_bbox_area'
# 缩放导致 bbox 面积为负
OVERLAP_BLOCKS_CAN_NOT_SEPARATION
=
(
'overlap_blocks_can_t_separation'
# 无法分离重叠的block
)
magic_pdf/config/drop_tag.py
deleted
100644 → 0
View file @
3bd0ecf1
COLOR_BG_HEADER_TXT_BLOCK
=
'color_background_header_txt_block'
PAGE_NO
=
'page-no'
# 页码
CONTENT_IN_FOOT_OR_HEADER
=
'in-foot-header-area'
# 页眉页脚内的文本
VERTICAL_TEXT
=
'vertical-text'
# 垂直文本
ROTATE_TEXT
=
'rotate-text'
# 旋转文本
EMPTY_SIDE_BLOCK
=
'empty-side-block'
# 边缘上的空白没有任何内容的block
ON_IMAGE_TEXT
=
'on-image-text'
# 文本在图片上
ON_TABLE_TEXT
=
'on-table-text'
# 文本在表格上
class
DropTag
:
PAGE_NUMBER
=
'page_no'
HEADER
=
'header'
FOOTER
=
'footer'
FOOTNOTE
=
'footnote'
NOT_IN_LAYOUT
=
'not_in_layout'
SPAN_OVERLAP
=
'span_overlap'
BLOCK_OVERLAP
=
'block_overlap'
magic_pdf/config/enums.py
deleted
100644 → 0
View file @
3bd0ecf1
import
enum
class
SupportedPdfParseMethod
(
enum
.
Enum
):
OCR
=
'ocr'
TXT
=
'txt'
magic_pdf/config/make_content_config.py
deleted
100644 → 0
View file @
3bd0ecf1
class
MakeMode
:
MM_MD
=
'mm_markdown'
NLP_MD
=
'nlp_markdown'
STANDARD_FORMAT
=
'standard_format'
class
DropMode
:
WHOLE_PDF
=
'whole_pdf'
SINGLE_PAGE
=
'single_page'
NONE
=
'none'
NONE_WITH_REASON
=
'none_with_reason'
magic_pdf/config/model_block_type.py
deleted
100644 → 0
View file @
3bd0ecf1
from
enum
import
Enum
class
ModelBlockTypeEnum
(
Enum
):
TITLE
=
0
PLAIN_TEXT
=
1
ABANDON
=
2
ISOLATE_FORMULA
=
8
EMBEDDING
=
13
ISOLATED
=
14
magic_pdf/config/ocr_content_type.py
deleted
100644 → 0
View file @
3bd0ecf1
class
ContentType
:
Image
=
'image'
Table
=
'table'
Text
=
'text'
InlineEquation
=
'inline_equation'
InterlineEquation
=
'interline_equation'
class
BlockType
:
Image
=
'image'
ImageBody
=
'image_body'
ImageCaption
=
'image_caption'
ImageFootnote
=
'image_footnote'
Table
=
'table'
TableBody
=
'table_body'
TableCaption
=
'table_caption'
TableFootnote
=
'table_footnote'
Text
=
'text'
Title
=
'title'
InterlineEquation
=
'interline_equation'
Footnote
=
'footnote'
Discarded
=
'discarded'
List
=
'list'
Index
=
'index'
class
CategoryId
:
Title
=
0
Text
=
1
Abandon
=
2
ImageBody
=
3
ImageCaption
=
4
TableBody
=
5
TableCaption
=
6
TableFootnote
=
7
InterlineEquation_Layout
=
8
InlineEquation
=
13
InterlineEquation_YOLO
=
14
OcrText
=
15
ImageFootnote
=
101
magic_pdf/data/batch_build_dataset.py
deleted
100644 → 0
View file @
3bd0ecf1
import
concurrent.futures
import
fitz
from
magic_pdf.data.dataset
import
PymuDocDataset
from
magic_pdf.data.utils
import
fitz_doc_to_image
# PyMuPDF
def
partition_array_greedy
(
arr
,
k
):
"""Partition an array into k parts using a simple greedy approach.
Parameters:
-----------
arr : list
The input array of integers
k : int
Number of partitions to create
Returns:
--------
partitions : list of lists
The k partitions of the array
"""
# Handle edge cases
if
k
<=
0
:
raise
ValueError
(
'k must be a positive integer'
)
if
k
>
len
(
arr
):
k
=
len
(
arr
)
# Adjust k if it's too large
if
k
==
1
:
return
[
list
(
range
(
len
(
arr
)))]
if
k
==
len
(
arr
):
return
[[
i
]
for
i
in
range
(
len
(
arr
))]
# Sort the array in descending order
sorted_indices
=
sorted
(
range
(
len
(
arr
)),
key
=
lambda
i
:
arr
[
i
][
1
],
reverse
=
True
)
# Initialize k empty partitions
partitions
=
[[]
for
_
in
range
(
k
)]
partition_sums
=
[
0
]
*
k
# Assign each element to the partition with the smallest current sum
for
idx
in
sorted_indices
:
# Find the partition with the smallest sum
min_sum_idx
=
partition_sums
.
index
(
min
(
partition_sums
))
# Add the element to this partition
partitions
[
min_sum_idx
].
append
(
idx
)
# Store the original index
partition_sums
[
min_sum_idx
]
+=
arr
[
idx
][
1
]
return
partitions
def
process_pdf_batch
(
pdf_jobs
,
idx
):
"""Process a batch of PDF pages using multiple threads.
Parameters:
-----------
pdf_jobs : list of tuples
List of (pdf_path, page_num) tuples
output_dir : str or None
Directory to save images to
num_threads : int
Number of threads to use
**kwargs :
Additional arguments for process_pdf_page
Returns:
--------
images : list
List of processed images
"""
images
=
[]
for
pdf_path
,
_
in
pdf_jobs
:
doc
=
fitz
.
open
(
pdf_path
)
tmp
=
[]
for
page_num
in
range
(
len
(
doc
)):
page
=
doc
[
page_num
]
tmp
.
append
(
fitz_doc_to_image
(
page
))
images
.
append
(
tmp
)
return
(
idx
,
images
)
def
batch_build_dataset
(
pdf_paths
,
k
,
lang
=
None
):
"""Process multiple PDFs by partitioning them into k balanced parts and
processing each part in parallel.
Parameters:
-----------
pdf_paths : list
List of paths to PDF files
k : int
Number of partitions to create
output_dir : str or None
Directory to save images to
threads_per_worker : int
Number of threads to use per worker
**kwargs :
Additional arguments for process_pdf_page
Returns:
--------
all_images : list
List of all processed images
"""
results
=
[]
for
pdf_path
in
pdf_paths
:
with
open
(
pdf_path
,
'rb'
)
as
f
:
pdf_bytes
=
f
.
read
()
dataset
=
PymuDocDataset
(
pdf_bytes
,
lang
=
lang
)
results
.
append
(
dataset
)
return
results
#
# # Get page counts for each PDF
# pdf_info = []
# total_pages = 0
#
# for pdf_path in pdf_paths:
# try:
# doc = fitz.open(pdf_path)
# num_pages = len(doc)
# pdf_info.append((pdf_path, num_pages))
# total_pages += num_pages
# doc.close()
# except Exception as e:
# print(f'Error opening {pdf_path}: {e}')
#
# # Partition the jobs based on page countEach job has 1 page
# partitions = partition_array_greedy(pdf_info, k)
#
# # Process each partition in parallel
# all_images_h = {}
#
# with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
# # Submit one task per partition
# futures = []
# for sn, partition in enumerate(partitions):
# # Get the jobs for this partition
# partition_jobs = [pdf_info[idx] for idx in partition]
#
# # Submit the task
# future = executor.submit(
# process_pdf_batch,
# partition_jobs,
# sn
# )
# futures.append(future)
# # Process results as they complete
# for i, future in enumerate(concurrent.futures.as_completed(futures)):
# try:
# idx, images = future.result()
# all_images_h[idx] = images
# except Exception as e:
# print(f'Error processing partition: {e}')
# results = [None] * len(pdf_paths)
# for i in range(len(partitions)):
# partition = partitions[i]
# for j in range(len(partition)):
# with open(pdf_info[partition[j]][0], 'rb') as f:
# pdf_bytes = f.read()
# dataset = PymuDocDataset(pdf_bytes, lang=lang)
# dataset.set_images(all_images_h[i][j])
# results[partition[j]] = dataset
# return results
\ No newline at end of file
magic_pdf/data/data_reader_writer/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
from
magic_pdf.data.data_reader_writer.filebase
import
\
FileBasedDataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.filebase
import
\
FileBasedDataWriter
# noqa: F401
from
magic_pdf.data.data_reader_writer.multi_bucket_s3
import
\
MultiBucketS3DataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.multi_bucket_s3
import
\
MultiBucketS3DataWriter
# noqa: F401
from
magic_pdf.data.data_reader_writer.s3
import
S3DataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.s3
import
S3DataWriter
# noqa: F401
from
magic_pdf.data.data_reader_writer.base
import
DataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.base
import
DataWriter
# noqa: F401
\ No newline at end of file
magic_pdf/data/dataset.py
deleted
100644 → 0
View file @
3bd0ecf1
import
os
from
abc
import
ABC
,
abstractmethod
from
typing
import
Callable
,
Iterator
import
fitz
from
loguru
import
logger
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.data.schemas
import
PageInfo
from
magic_pdf.data.utils
import
fitz_doc_to_image
from
magic_pdf.filter
import
classify
class
PageableData
(
ABC
):
@
abstractmethod
def
get_image
(
self
)
->
dict
:
"""Transform data to image."""
pass
@
abstractmethod
def
get_doc
(
self
)
->
fitz
.
Page
:
"""Get the pymudoc page."""
pass
@
abstractmethod
def
get_page_info
(
self
)
->
PageInfo
:
"""Get the page info of the page.
Returns:
PageInfo: the page info of this page
"""
pass
@
abstractmethod
def
draw_rect
(
self
,
rect_coords
,
color
,
fill
,
fill_opacity
,
width
,
overlay
):
"""draw rectangle.
Args:
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
fill (list[float] | None): fill the board with RGB, None means will not fill with color
fill_opacity (float): opacity of the fill, range from [0, 1]
width (float): the width of board
overlay (bool): fill the color in foreground or background. True means fill in background.
"""
pass
@
abstractmethod
def
insert_text
(
self
,
coord
,
content
,
fontsize
,
color
):
"""insert text.
Args:
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
content (str): the text content
fontsize (int): font size of the text
color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
"""
pass
class
Dataset
(
ABC
):
@
abstractmethod
def
__len__
(
self
)
->
int
:
"""The length of the dataset."""
pass
@
abstractmethod
def
__iter__
(
self
)
->
Iterator
[
PageableData
]:
"""Yield the page data."""
pass
@
abstractmethod
def
supported_methods
(
self
)
->
list
[
SupportedPdfParseMethod
]:
"""The methods that this dataset support.
Returns:
list[SupportedPdfParseMethod]: The supported methods, Valid methods are: OCR, TXT
"""
pass
@
abstractmethod
def
data_bits
(
self
)
->
bytes
:
"""The bits used to create this dataset."""
pass
@
abstractmethod
def
get_page
(
self
,
page_id
:
int
)
->
PageableData
:
"""Get the page indexed by page_id.
Args:
page_id (int): the index of the page
Returns:
PageableData: the page doc object
"""
pass
@
abstractmethod
def
dump_to_file
(
self
,
file_path
:
str
):
"""Dump the file.
Args:
file_path (str): the file path
"""
pass
@
abstractmethod
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(self, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
pass
@
abstractmethod
def
classify
(
self
)
->
SupportedPdfParseMethod
:
"""classify the dataset.
Returns:
SupportedPdfParseMethod: _description_
"""
pass
@
abstractmethod
def
clone
(
self
):
"""clone this dataset."""
pass
class
PymuDocDataset
(
Dataset
):
def
__init__
(
self
,
bits
:
bytes
,
lang
=
None
):
"""Initialize the dataset, which wraps the pymudoc documents.
Args:
bits (bytes): the bytes of the pdf
"""
self
.
_raw_fitz
=
fitz
.
open
(
'pdf'
,
bits
)
self
.
_records
=
[
Doc
(
v
)
for
v
in
self
.
_raw_fitz
]
self
.
_data_bits
=
bits
self
.
_raw_data
=
bits
self
.
_classify_result
=
None
if
lang
==
''
:
self
.
_lang
=
None
elif
lang
==
'auto'
:
from
magic_pdf.model.sub_modules.language_detection.utils
import
\
auto_detect_lang
self
.
_lang
=
auto_detect_lang
(
self
.
_data_bits
)
logger
.
info
(
f
'lang:
{
lang
}
, detect_lang:
{
self
.
_lang
}
'
)
else
:
self
.
_lang
=
lang
logger
.
info
(
f
'lang:
{
lang
}
'
)
def
__len__
(
self
)
->
int
:
"""The page number of the pdf."""
return
len
(
self
.
_records
)
def
__iter__
(
self
)
->
Iterator
[
PageableData
]:
"""Yield the page doc object."""
return
iter
(
self
.
_records
)
def
supported_methods
(
self
)
->
list
[
SupportedPdfParseMethod
]:
"""The method supported by this dataset.
Returns:
list[SupportedPdfParseMethod]: the supported methods
"""
return
[
SupportedPdfParseMethod
.
OCR
,
SupportedPdfParseMethod
.
TXT
]
def
data_bits
(
self
)
->
bytes
:
"""The pdf bits used to create this dataset."""
return
self
.
_data_bits
def
get_page
(
self
,
page_id
:
int
)
->
PageableData
:
"""The page doc object.
Args:
page_id (int): the page doc index
Returns:
PageableData: the page doc object
"""
return
self
.
_records
[
page_id
]
def
dump_to_file
(
self
,
file_path
:
str
):
"""Dump the file.
Args:
file_path (str): the file path
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
if
dir_name
not
in
(
''
,
'.'
,
'..'
):
os
.
makedirs
(
dir_name
,
exist_ok
=
True
)
self
.
_raw_fitz
.
save
(
file_path
)
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(dataset, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
if
'lang'
in
kwargs
and
self
.
_lang
is
not
None
:
kwargs
[
'lang'
]
=
self
.
_lang
return
proc
(
self
,
*
args
,
**
kwargs
)
def
classify
(
self
)
->
SupportedPdfParseMethod
:
"""classify the dataset.
Returns:
SupportedPdfParseMethod: _description_
"""
if
self
.
_classify_result
is
None
:
self
.
_classify_result
=
classify
(
self
.
_data_bits
)
return
self
.
_classify_result
def
clone
(
self
):
"""clone this dataset."""
return
PymuDocDataset
(
self
.
_raw_data
)
def
set_images
(
self
,
images
):
for
i
in
range
(
len
(
self
.
_records
)):
self
.
_records
[
i
].
set_image
(
images
[
i
])
class
ImageDataset
(
Dataset
):
def
__init__
(
self
,
bits
:
bytes
,
lang
=
None
):
"""Initialize the dataset, which wraps the pymudoc documents.
Args:
bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
"""
pdf_bytes
=
fitz
.
open
(
stream
=
bits
).
convert_to_pdf
()
self
.
_raw_fitz
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
self
.
_records
=
[
Doc
(
v
)
for
v
in
self
.
_raw_fitz
]
self
.
_raw_data
=
bits
self
.
_data_bits
=
pdf_bytes
if
lang
==
''
:
self
.
_lang
=
None
elif
lang
==
'auto'
:
from
magic_pdf.model.sub_modules.language_detection.utils
import
\
auto_detect_lang
self
.
_lang
=
auto_detect_lang
(
self
.
_data_bits
)
logger
.
info
(
f
'lang:
{
lang
}
, detect_lang:
{
self
.
_lang
}
'
)
else
:
self
.
_lang
=
lang
logger
.
info
(
f
'lang:
{
lang
}
'
)
def
__len__
(
self
)
->
int
:
"""The length of the dataset."""
return
len
(
self
.
_records
)
def
__iter__
(
self
)
->
Iterator
[
PageableData
]:
"""Yield the page object."""
return
iter
(
self
.
_records
)
def
supported_methods
(
self
):
"""The method supported by this dataset.
Returns:
list[SupportedPdfParseMethod]: the supported methods
"""
return
[
SupportedPdfParseMethod
.
OCR
]
def
data_bits
(
self
)
->
bytes
:
"""The pdf bits used to create this dataset."""
return
self
.
_data_bits
def
get_page
(
self
,
page_id
:
int
)
->
PageableData
:
"""The page doc object.
Args:
page_id (int): the page doc index
Returns:
PageableData: the page doc object
"""
return
self
.
_records
[
page_id
]
def
dump_to_file
(
self
,
file_path
:
str
):
"""Dump the file.
Args:
file_path (str): the file path
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
if
dir_name
not
in
(
''
,
'.'
,
'..'
):
os
.
makedirs
(
dir_name
,
exist_ok
=
True
)
self
.
_raw_fitz
.
save
(
file_path
)
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply callable method which.
Args:
proc (Callable): invoke proc as follows:
proc(dataset, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return
proc
(
self
,
*
args
,
**
kwargs
)
def
classify
(
self
)
->
SupportedPdfParseMethod
:
"""classify the dataset.
Returns:
SupportedPdfParseMethod: _description_
"""
return
SupportedPdfParseMethod
.
OCR
def
clone
(
self
):
"""clone this dataset."""
return
ImageDataset
(
self
.
_raw_data
)
def
set_images
(
self
,
images
):
for
i
in
range
(
len
(
self
.
_records
)):
self
.
_records
[
i
].
set_image
(
images
[
i
])
class
Doc
(
PageableData
):
"""Initialized with pymudoc object."""
def
__init__
(
self
,
doc
:
fitz
.
Page
):
self
.
_doc
=
doc
self
.
_img
=
None
def
get_image
(
self
):
"""Return the image info.
Returns:
dict: {
img: np.ndarray,
width: int,
height: int
}
"""
if
self
.
_img
is
None
:
self
.
_img
=
fitz_doc_to_image
(
self
.
_doc
)
return
self
.
_img
def
set_image
(
self
,
img
):
"""
Args:
img (np.ndarray): the image
"""
if
self
.
_img
is
None
:
self
.
_img
=
img
def
get_doc
(
self
)
->
fitz
.
Page
:
"""Get the pymudoc object.
Returns:
fitz.Page: the pymudoc object
"""
return
self
.
_doc
def
get_page_info
(
self
)
->
PageInfo
:
"""Get the page info of the page.
Returns:
PageInfo: the page info of this page
"""
page_w
=
self
.
_doc
.
rect
.
width
page_h
=
self
.
_doc
.
rect
.
height
return
PageInfo
(
w
=
page_w
,
h
=
page_h
)
def
__getattr__
(
self
,
name
):
if
hasattr
(
self
.
_doc
,
name
):
return
getattr
(
self
.
_doc
,
name
)
def
draw_rect
(
self
,
rect_coords
,
color
,
fill
,
fill_opacity
,
width
,
overlay
):
"""draw rectangle.
Args:
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
fill (list[float] | None): fill the board with RGB, None means will not fill with color
fill_opacity (float): opacity of the fill, range from [0, 1]
width (float): the width of board
overlay (bool): fill the color in foreground or background. True means fill in background.
"""
self
.
_doc
.
draw_rect
(
rect_coords
,
color
=
color
,
fill
=
fill
,
fill_opacity
=
fill_opacity
,
width
=
width
,
overlay
=
overlay
,
)
def
insert_text
(
self
,
coord
,
content
,
fontsize
,
color
):
"""insert text.
Args:
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
content (str): the text content
fontsize (int): font size of the text
color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
"""
self
.
_doc
.
insert_text
(
coord
,
content
,
fontsize
=
fontsize
,
color
=
color
)
\ No newline at end of file
magic_pdf/data/io/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
from
magic_pdf.data.io.base
import
IOReader
,
IOWriter
# noqa: F401
from
magic_pdf.data.io.http
import
HttpReader
,
HttpWriter
# noqa: F401
from
magic_pdf.data.io.s3
import
S3Reader
,
S3Writer
# noqa: F401
__all__
=
[
'IOReader'
,
'IOWriter'
,
'HttpReader'
,
'HttpWriter'
,
'S3Reader'
,
'S3Writer'
]
\ No newline at end of file
magic_pdf/data/read_api.py
deleted
100644 → 0
View file @
3bd0ecf1
import
json
import
os
import
tempfile
import
shutil
from
pathlib
import
Path
from
magic_pdf.config.exceptions
import
EmptyData
,
InvalidParams
from
magic_pdf.data.data_reader_writer
import
(
FileBasedDataReader
,
MultiBucketS3DataReader
)
from
magic_pdf.data.dataset
import
ImageDataset
,
PymuDocDataset
from
magic_pdf.utils.office_to_pdf
import
convert_file_to_pdf
,
ConvertToPdfError
def
read_jsonl
(
s3_path_or_local
:
str
,
s3_client
:
MultiBucketS3DataReader
|
None
=
None
)
->
list
[
PymuDocDataset
]:
"""Read the jsonl file and return the list of PymuDocDataset.
Args:
s3_path_or_local (str): local file or s3 path
s3_client (MultiBucketS3DataReader | None, optional): s3 client that support multiple bucket. Defaults to None.
Raises:
InvalidParams: if s3_path_or_local is s3 path but s3_client is not provided.
EmptyData: if no pdf file location is provided in some line of jsonl file.
InvalidParams: if the file location is s3 path but s3_client is not provided
Returns:
list[PymuDocDataset]: each line in the jsonl file will be converted to a PymuDocDataset
"""
bits_arr
=
[]
if
s3_path_or_local
.
startswith
(
's3://'
):
if
s3_client
is
None
:
raise
InvalidParams
(
's3_client is required when s3_path is provided'
)
jsonl_bits
=
s3_client
.
read
(
s3_path_or_local
)
else
:
jsonl_bits
=
FileBasedDataReader
(
''
).
read
(
s3_path_or_local
)
jsonl_d
=
[
json
.
loads
(
line
)
for
line
in
jsonl_bits
.
decode
().
split
(
'
\n
'
)
if
line
.
strip
()
]
for
d
in
jsonl_d
:
pdf_path
=
d
.
get
(
'file_location'
,
''
)
or
d
.
get
(
'path'
,
''
)
if
len
(
pdf_path
)
==
0
:
raise
EmptyData
(
'pdf file location is empty'
)
if
pdf_path
.
startswith
(
's3://'
):
if
s3_client
is
None
:
raise
InvalidParams
(
's3_client is required when s3_path is provided'
)
bits_arr
.
append
(
s3_client
.
read
(
pdf_path
))
else
:
bits_arr
.
append
(
FileBasedDataReader
(
''
).
read
(
pdf_path
))
return
[
PymuDocDataset
(
bits
)
for
bits
in
bits_arr
]
def
read_local_pdfs
(
path
:
str
)
->
list
[
PymuDocDataset
]:
"""Read pdf from path or directory.
Args:
path (str): pdf file path or directory that contains pdf files
Returns:
list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
"""
if
os
.
path
.
isdir
(
path
):
reader
=
FileBasedDataReader
()
ret
=
[]
for
root
,
_
,
files
in
os
.
walk
(
path
):
for
file
in
files
:
suffix
=
file
.
split
(
'.'
)
if
suffix
[
-
1
]
==
'pdf'
:
ret
.
append
(
PymuDocDataset
(
reader
.
read
(
os
.
path
.
join
(
root
,
file
))))
return
ret
else
:
reader
=
FileBasedDataReader
()
bits
=
reader
.
read
(
path
)
return
[
PymuDocDataset
(
bits
)]
def
read_local_office
(
path
:
str
)
->
list
[
PymuDocDataset
]:
"""Read ms-office file (ppt, pptx, doc, docx) from path or directory.
Args:
path (str): ms-office file or directory that contains ms-office files
Returns:
list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
Raises:
ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
FileNotFoundError: File not Found
Exception: Unknown Exception raised
"""
suffixes
=
[
'.ppt'
,
'.pptx'
,
'.doc'
,
'.docx'
]
fns
=
[]
ret
=
[]
if
os
.
path
.
isdir
(
path
):
for
root
,
_
,
files
in
os
.
walk
(
path
):
for
file
in
files
:
suffix
=
Path
(
file
).
suffix
if
suffix
in
suffixes
:
fns
.
append
((
os
.
path
.
join
(
root
,
file
)))
else
:
fns
.
append
(
path
)
reader
=
FileBasedDataReader
()
temp_dir
=
tempfile
.
mkdtemp
()
for
fn
in
fns
:
try
:
convert_file_to_pdf
(
fn
,
temp_dir
)
except
ConvertToPdfError
as
e
:
raise
e
except
FileNotFoundError
as
e
:
raise
e
except
Exception
as
e
:
raise
e
fn_path
=
Path
(
fn
)
pdf_fn
=
f
"
{
temp_dir
}
/
{
fn_path
.
stem
}
.pdf"
ret
.
append
(
PymuDocDataset
(
reader
.
read
(
pdf_fn
)))
shutil
.
rmtree
(
temp_dir
)
return
ret
def
read_local_images
(
path
:
str
,
suffixes
:
list
[
str
]
=
[
'.png'
,
'.jpg'
,
'.jpeg'
])
->
list
[
ImageDataset
]:
"""Read images from path or directory.
Args:
path (str): image file path or directory that contains image files
suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
Returns:
list[ImageDataset]: each image file will converted to a ImageDataset
"""
if
os
.
path
.
isdir
(
path
):
imgs_bits
=
[]
s_suffixes
=
set
(
suffixes
)
reader
=
FileBasedDataReader
()
for
root
,
_
,
files
in
os
.
walk
(
path
):
for
file
in
files
:
suffix
=
Path
(
file
).
suffix
if
suffix
in
s_suffixes
:
imgs_bits
.
append
(
reader
.
read
(
os
.
path
.
join
(
root
,
file
)))
return
[
ImageDataset
(
bits
)
for
bits
in
imgs_bits
]
else
:
reader
=
FileBasedDataReader
()
bits
=
reader
.
read
(
path
)
return
[
ImageDataset
(
bits
)]
magic_pdf/data/utils.py
deleted
100644 → 0
View file @
3bd0ecf1
import
multiprocessing
as
mp
import
threading
from
concurrent.futures
import
(
ProcessPoolExecutor
,
ThreadPoolExecutor
,
as_completed
)
import
fitz
import
numpy
as
np
from
loguru
import
logger
def
fitz_doc_to_image
(
page
,
dpi
=
200
)
->
dict
:
"""Convert fitz.Document to image, Then convert the image to numpy array.
Args:
page (_type_): pymudoc page
dpi (int, optional): reset the dpi of dpi. Defaults to 200.
Returns:
dict: {'img': numpy array, 'width': width, 'height': height }
"""
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
# If the width or height exceeds 4500 after scaling, do not scale further.
if
pm
.
width
>
4500
or
pm
.
height
>
4500
:
pm
=
page
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
# Convert pixmap samples directly to numpy array
img
=
np
.
frombuffer
(
pm
.
samples
,
dtype
=
np
.
uint8
).
reshape
(
pm
.
height
,
pm
.
width
,
3
)
img_dict
=
{
'img'
:
img
,
'width'
:
pm
.
width
,
'height'
:
pm
.
height
}
return
img_dict
def
load_images_from_pdf
(
pdf_bytes
:
bytes
,
dpi
=
200
,
start_page_id
=
0
,
end_page_id
=
None
)
->
list
:
images
=
[]
with
fitz
.
open
(
'pdf'
,
pdf_bytes
)
as
doc
:
pdf_page_num
=
doc
.
page_count
end_page_id
=
(
end_page_id
if
end_page_id
is
not
None
and
end_page_id
>=
0
else
pdf_page_num
-
1
)
if
end_page_id
>
pdf_page_num
-
1
:
logger
.
warning
(
'end_page_id is out of range, use images length'
)
end_page_id
=
pdf_page_num
-
1
for
index
in
range
(
0
,
doc
.
page_count
):
if
start_page_id
<=
index
<=
end_page_id
:
page
=
doc
[
index
]
mat
=
fitz
.
Matrix
(
dpi
/
72
,
dpi
/
72
)
pm
=
page
.
get_pixmap
(
matrix
=
mat
,
alpha
=
False
)
# If the width or height exceeds 4500 after scaling, do not scale further.
if
pm
.
width
>
4500
or
pm
.
height
>
4500
:
pm
=
page
.
get_pixmap
(
matrix
=
fitz
.
Matrix
(
1
,
1
),
alpha
=
False
)
# Convert pixmap samples directly to numpy array
img
=
np
.
frombuffer
(
pm
.
samples
,
dtype
=
np
.
uint8
).
reshape
(
pm
.
height
,
pm
.
width
,
3
)
img_dict
=
{
'img'
:
img
,
'width'
:
pm
.
width
,
'height'
:
pm
.
height
}
else
:
img_dict
=
{
'img'
:
[],
'width'
:
0
,
'height'
:
0
}
images
.
append
(
img_dict
)
return
images
def
convert_page
(
bytes_page
):
pdfs
=
fitz
.
open
(
'pdf'
,
bytes_page
)
page
=
pdfs
[
0
]
return
fitz_doc_to_image
(
page
)
def
parallel_process_pdf_safe
(
pages
,
num_workers
=
None
,
**
kwargs
):
"""Process PDF pages in parallel with serialization-safe approach."""
if
num_workers
is
None
:
num_workers
=
mp
.
cpu_count
()
# Process the extracted page data in parallel
with
ProcessPoolExecutor
(
max_workers
=
num_workers
)
as
executor
:
# Process the page data
results
=
list
(
executor
.
map
(
convert_page
,
pages
)
)
return
results
def
threaded_process_pdf
(
pdf_path
,
num_threads
=
4
,
**
kwargs
):
"""Process all pages of a PDF using multiple threads.
Parameters:
-----------
pdf_path : str
Path to the PDF file
num_threads : int
Number of threads to use
**kwargs :
Additional arguments for fitz_doc_to_image
Returns:
--------
images : list
List of processed images, in page order
"""
# Open the PDF
doc
=
fitz
.
open
(
pdf_path
)
num_pages
=
len
(
doc
)
# Create a list to store results in the correct order
results
=
[
None
]
*
num_pages
# Create a thread pool
with
ThreadPoolExecutor
(
max_workers
=
num_threads
)
as
executor
:
# Submit all tasks
futures
=
{}
for
page_num
in
range
(
num_pages
):
page
=
doc
[
page_num
]
future
=
executor
.
submit
(
fitz_doc_to_image
,
page
,
**
kwargs
)
futures
[
future
]
=
page_num
# Process results as they complete with progress bar
for
future
in
as_completed
(
futures
):
page_num
=
futures
[
future
]
try
:
results
[
page_num
]
=
future
.
result
()
except
Exception
as
e
:
print
(
f
'Error processing page
{
page_num
}
:
{
e
}
'
)
results
[
page_num
]
=
None
# Close the document
doc
.
close
()
if
__name__
==
'__main__'
:
pdf
=
fitz
.
open
(
'/tmp/[MS-DOC].pdf'
)
pdf_page
=
[
fitz
.
open
()
for
i
in
range
(
pdf
.
page_count
)]
[
pdf_page
[
i
].
insert_pdf
(
pdf
,
from_page
=
i
,
to_page
=
i
)
for
i
in
range
(
pdf
.
page_count
)]
pdf_page
=
[
v
.
tobytes
()
for
v
in
pdf_page
]
results
=
parallel_process_pdf_safe
(
pdf_page
,
num_workers
=
16
)
# threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)
""" benchmark results of multi-threaded processing (fitz page to image)
total page nums: 578
thread nums, time cost
1 7.351 sec
2 6.334 sec
4 5.968 sec
8 6.728 sec
16 8.085 sec
"""
""" benchmark results of multi-processor processing (fitz page to image)
total page nums: 578
processor nums, time cost
1 17.170 sec
2 10.170 sec
4 7.841 sec
8 7.900 sec
16 7.984 sec
"""
magic_pdf/filter/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
from
magic_pdf.config.drop_reason
import
DropReason
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.filter.pdf_classify_by_type
import
classify
as
do_classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
def
classify
(
pdf_bytes
:
bytes
)
->
SupportedPdfParseMethod
:
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
pdf_meta
=
pdf_meta_scan
(
pdf_bytes
)
if
pdf_meta
.
get
(
'_need_drop'
,
False
):
# 如果返回了需要丢弃的标志,则抛出异常
raise
Exception
(
f
"pdf meta_scan need_drop,reason is
{
pdf_meta
[
'_drop_reason'
]
}
"
)
else
:
is_encrypted
=
pdf_meta
[
'is_encrypted'
]
is_needs_password
=
pdf_meta
[
'is_needs_password'
]
if
is_encrypted
or
is_needs_password
:
# 加密的,需要密码的,没有页面的,都不处理
raise
Exception
(
f
'pdf meta_scan need_drop,reason is
{
DropReason
.
ENCRYPTED
}
'
)
else
:
is_text_pdf
,
results
=
do_classify
(
pdf_meta
[
'total_page'
],
pdf_meta
[
'page_width_pts'
],
pdf_meta
[
'page_height_pts'
],
pdf_meta
[
'image_info_per_page'
],
pdf_meta
[
'text_len_per_page'
],
pdf_meta
[
'imgs_per_page'
],
# pdf_meta['text_layout_per_page'],
pdf_meta
[
'invalid_chars'
],
)
if
is_text_pdf
:
return
SupportedPdfParseMethod
.
TXT
else
:
return
SupportedPdfParseMethod
.
OCR
magic_pdf/filter/pdf_classify_by_type.py
deleted
100644 → 0
View file @
3bd0ecf1
This diff is collapsed.
Click to expand it.
magic_pdf/filter/pdf_meta_scan.py
deleted
100644 → 0
View file @
3bd0ecf1
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
from
collections
import
Counter
import
fitz
from
loguru
import
logger
from
magic_pdf.config.drop_reason
import
DropReason
from
magic_pdf.libs.commons
import
get_top_percent_list
,
mymax
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.pdf_check
import
detect_invalid_chars_by_pymupdf
,
detect_invalid_chars
scan_max_page
=
50
junk_limit_min
=
10
def
calculate_max_image_area_per_page
(
result
:
list
,
page_width_pts
,
page_height_pts
):
max_image_area_per_page
=
[
mymax
([(
x1
-
x0
)
*
(
y1
-
y0
)
for
x0
,
y0
,
x1
,
y1
,
_
in
page_img_sz
])
for
page_img_sz
in
result
]
page_area
=
int
(
page_width_pts
)
*
int
(
page_height_pts
)
max_image_area_per_page
=
[
area
/
page_area
for
area
in
max_image_area_per_page
]
max_image_area_per_page
=
[
area
for
area
in
max_image_area_per_page
if
area
>
0.6
]
return
max_image_area_per_page
def
process_image
(
page
,
junk_img_bojids
=
[]):
page_result
=
[]
# 存每个页面里的多张图四元组信息
items
=
page
.
get_images
()
dedup
=
set
()
for
img
in
items
:
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
img_bojid
=
img
[
0
]
# 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
if
img_bojid
in
junk_img_bojids
:
# 如果是垃圾图像,就跳过
continue
recs
=
page
.
get_image_rects
(
img
,
transform
=
True
)
if
recs
:
rec
=
recs
[
0
][
0
]
x0
,
y0
,
x1
,
y1
=
map
(
int
,
rec
)
width
=
x1
-
x0
height
=
y1
-
y0
if
(
x0
,
y0
,
x1
,
y1
,
img_bojid
,
)
in
dedup
:
# 这里面会出现一些重复的bbox,无需重复出现,需要去掉
continue
if
not
all
(
[
width
,
height
]
):
# 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义
continue
dedup
.
add
((
x0
,
y0
,
x1
,
y1
,
img_bojid
))
page_result
.
append
([
x0
,
y0
,
x1
,
y1
,
img_bojid
])
return
page_result
def
get_image_info
(
doc
:
fitz
.
Document
,
page_width_pts
,
page_height_pts
)
->
list
:
"""返回每个页面里的图片的四元组,每个页面多个图片。
:param doc:
:return:
"""
# 使用 Counter 计数 img_bojid 的出现次数
img_bojid_counter
=
Counter
(
img
[
0
]
for
page
in
doc
for
img
in
page
.
get_images
())
# 找出出现次数超过 len(doc) 半数的 img_bojid
junk_limit
=
max
(
len
(
doc
)
*
0.5
,
junk_limit_min
)
# 对一些页数比较少的进行豁免
junk_img_bojids
=
[
img_bojid
for
img_bojid
,
count
in
img_bojid_counter
.
items
()
if
count
>=
junk_limit
]
# todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多
# 有两种扫描版,一种文字版,这里可能会有误判
# 扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张
# 扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断
# 文 字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist
imgs_len_list
=
[
len
(
page
.
get_images
())
for
page
in
doc
]
special_limit_pages
=
10
# 统一用前十页结果做判断
result
=
[]
break_loop
=
False
for
i
,
page
in
enumerate
(
doc
):
if
break_loop
:
break
if
i
>=
special_limit_pages
:
break
page_result
=
process_image
(
page
)
# 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析
result
.
append
(
page_result
)
for
item
in
result
:
if
not
any
(
item
):
# 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
if
(
max
(
imgs_len_list
)
==
min
(
imgs_len_list
)
and
max
(
imgs_len_list
)
>=
junk_limit_min
):
# 如果是特殊文字版,就把junklist置空并break
junk_img_bojids
=
[]
else
:
# 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
pass
break_loop
=
True
break
if
not
break_loop
:
# 获取前80%的元素
top_eighty_percent
=
get_top_percent_list
(
imgs_len_list
,
0.8
)
# 检查前80%的元素是否都相等
if
len
(
set
(
top_eighty_percent
))
==
1
and
max
(
imgs_len_list
)
>=
junk_limit_min
:
# # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
# 前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
max_image_area_per_page
=
calculate_max_image_area_per_page
(
result
,
page_width_pts
,
page_height_pts
)
if
(
len
(
max_image_area_per_page
)
<
0.8
*
special_limit_pages
):
# 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
junk_img_bojids
=
[]
else
:
# 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
pass
else
:
# 每页图片数量不一致,需要清掉junklist全量跑前50页图片
junk_img_bojids
=
[]
# 正式进入取前50页图片的信息流程
result
=
[]
for
i
,
page
in
enumerate
(
doc
):
if
i
>=
scan_max_page
:
break
page_result
=
process_image
(
page
,
junk_img_bojids
)
# logger.info(f"page {i} img_len: {len(page_result)}")
result
.
append
(
page_result
)
return
result
,
junk_img_bojids
def
get_pdf_page_size_pts
(
doc
:
fitz
.
Document
):
page_cnt
=
len
(
doc
)
l
:
int
=
min
(
page_cnt
,
50
)
# 把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了)
page_width_list
=
[]
page_height_list
=
[]
for
i
in
range
(
l
):
page
=
doc
[
i
]
page_rect
=
page
.
rect
page_width_list
.
append
(
page_rect
.
width
)
page_height_list
.
append
(
page_rect
.
height
)
page_width_list
.
sort
()
page_height_list
.
sort
()
median_width
=
page_width_list
[
len
(
page_width_list
)
//
2
]
median_height
=
page_height_list
[
len
(
page_height_list
)
//
2
]
return
median_width
,
median_height
def
get_pdf_textlen_per_page
(
doc
:
fitz
.
Document
):
text_len_lst
=
[]
for
page
in
doc
:
# 拿包含img和text的所有blocks
# text_block = page.get_text("blocks")
# 拿所有text的blocks
# text_block = page.get_text("words")
# text_block_len = sum([len(t[4]) for t in text_block])
# 拿所有text的str
text_block
=
page
.
get_text
(
'text'
)
text_block_len
=
len
(
text_block
)
# logger.info(f"page {page.number} text_block_len: {text_block_len}")
text_len_lst
.
append
(
text_block_len
)
return
text_len_lst
def
get_pdf_text_layout_per_page
(
doc
:
fitz
.
Document
):
"""根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
List[str]: 每一页的文本布局(横向、纵向、未知)。
"""
text_layout_list
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
if
page_id
>=
scan_max_page
:
break
# 创建每一页的纵向和横向的文本行数计数器
vertical_count
=
0
horizontal_count
=
0
text_dict
=
page
.
get_text
(
'dict'
)
if
'blocks'
in
text_dict
:
for
block
in
text_dict
[
'blocks'
]:
if
'lines'
in
block
:
for
line
in
block
[
'lines'
]:
# 获取line的bbox顶点坐标
x0
,
y0
,
x1
,
y1
=
line
[
'bbox'
]
# 计算bbox的宽高
width
=
x1
-
x0
height
=
y1
-
y0
# 计算bbox的面积
area
=
width
*
height
font_sizes
=
[]
for
span
in
line
[
'spans'
]:
if
'size'
in
span
:
font_sizes
.
append
(
span
[
'size'
])
if
len
(
font_sizes
)
>
0
:
average_font_size
=
sum
(
font_sizes
)
/
len
(
font_sizes
)
else
:
average_font_size
=
(
10
# 有的line拿不到font_size,先定一个阈值100
)
if
(
area
<=
average_font_size
**
2
):
# 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向
continue
else
:
if
'wmode'
in
line
:
# 通过wmode判断文本方向
if
line
[
'wmode'
]
==
1
:
# 判断是否为竖向文本
vertical_count
+=
1
elif
line
[
'wmode'
]
==
0
:
# 判断是否为横向文本
horizontal_count
+=
1
# if 'dir' in line: # 通过旋转角度计算判断文本方向
# # 获取行的 "dir" 值
# dir_value = line['dir']
# cosine, sine = dir_value
# # 计算角度
# angle = math.degrees(math.acos(cosine))
#
# # 判断是否为横向文本
# if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01:
# # line_text = ' '.join(span['text'] for span in line['spans'])
# # print('This line is horizontal:', line_text)
# horizontal_count += 1
# # 判断是否为纵向文本
# elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01:
# # line_text = ' '.join(span['text'] for span in line['spans'])
# # print('This line is vertical:', line_text)
# vertical_count += 1
# print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
# 判断每一页的文本布局
if
vertical_count
==
0
and
horizontal_count
==
0
:
# 该页没有文本,无法判断
text_layout_list
.
append
(
'unknow'
)
continue
else
:
if
vertical_count
>
horizontal_count
:
# 该页的文本纵向行数大于横向的
text_layout_list
.
append
(
'vertical'
)
else
:
# 该页的文本横向行数大于纵向的
text_layout_list
.
append
(
'horizontal'
)
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
return
text_layout_list
"""定义一个自定义异常用来抛出单页svg太多的pdf"""
class
PageSvgsTooManyError
(
Exception
):
def
__init__
(
self
,
message
=
'Page SVGs are too many'
):
self
.
message
=
message
super
().
__init__
(
self
.
message
)
def
get_svgs_per_page
(
doc
:
fitz
.
Document
):
svgs_len_list
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
# svgs = page.get_drawings()
svgs
=
page
.
get_cdrawings
()
# 切换成get_cdrawings,效率更高
len_svgs
=
len
(
svgs
)
if
len_svgs
>=
3000
:
raise
PageSvgsTooManyError
()
else
:
svgs_len_list
.
append
(
len_svgs
)
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
return
svgs_len_list
def
get_imgs_per_page
(
doc
:
fitz
.
Document
):
imgs_len_list
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
imgs
=
page
.
get_images
()
imgs_len_list
.
append
(
len
(
imgs
))
# logger.info(f"page_id: {page}, imgs_len: {len(imgs)}")
return
imgs_len_list
def
get_language
(
doc
:
fitz
.
Document
):
"""
获取PDF文档的语言。
Args:
doc (fitz.Document): PDF文档对象。
Returns:
str: 文档语言,如 "en-US"。
"""
language_lst
=
[]
for
page_id
,
page
in
enumerate
(
doc
):
if
page_id
>=
scan_max_page
:
break
# 拿所有text的str
text_block
=
page
.
get_text
(
'text'
)
page_language
=
detect_lang
(
text_block
)
language_lst
.
append
(
page_language
)
# logger.info(f"page_id: {page_id}, page_language: {page_language}")
# 统计text_language_list中每种语言的个数
count_dict
=
Counter
(
language_lst
)
# 输出text_language_list中出现的次数最多的语言
language
=
max
(
count_dict
,
key
=
count_dict
.
get
)
return
language
def
check_invalid_chars
(
pdf_bytes
):
"""乱码检测."""
# return detect_invalid_chars_by_pymupdf(pdf_bytes)
return
detect_invalid_chars
(
pdf_bytes
)
def
pdf_meta_scan
(
pdf_bytes
:
bytes
):
"""
:param s3_pdf_path:
:param pdf_bytes: pdf文件的二进制数据
几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取
"""
doc
=
fitz
.
open
(
'pdf'
,
pdf_bytes
)
is_needs_password
=
doc
.
needs_pass
is_encrypted
=
doc
.
is_encrypted
total_page
=
len
(
doc
)
if
total_page
==
0
:
logger
.
warning
(
f
'drop this pdf, drop_reason:
{
DropReason
.
EMPTY_PDF
}
'
)
result
=
{
'_need_drop'
:
True
,
'_drop_reason'
:
DropReason
.
EMPTY_PDF
}
return
result
else
:
page_width_pts
,
page_height_pts
=
get_pdf_page_size_pts
(
doc
)
# logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")
# svgs_per_page = get_svgs_per_page(doc)
# logger.info(f"svgs_per_page: {svgs_per_page}")
imgs_per_page
=
get_imgs_per_page
(
doc
)
# logger.info(f"imgs_per_page: {imgs_per_page}")
image_info_per_page
,
junk_img_bojids
=
get_image_info
(
doc
,
page_width_pts
,
page_height_pts
)
# logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
text_len_per_page
=
get_pdf_textlen_per_page
(
doc
)
# logger.info(f"text_len_per_page: {text_len_per_page}")
# text_layout_per_page = get_pdf_text_layout_per_page(doc)
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
# text_language = get_language(doc)
# logger.info(f"text_language: {text_language}")
invalid_chars
=
check_invalid_chars
(
pdf_bytes
)
# logger.info(f"invalid_chars: {invalid_chars}")
# 最后输出一条json
res
=
{
'is_needs_password'
:
is_needs_password
,
'is_encrypted'
:
is_encrypted
,
'total_page'
:
total_page
,
'page_width_pts'
:
int
(
page_width_pts
),
'page_height_pts'
:
int
(
page_height_pts
),
'image_info_per_page'
:
image_info_per_page
,
'text_len_per_page'
:
text_len_per_page
,
# 'text_layout_per_page': text_layout_per_page,
# 'text_language': text_language,
# "svgs_per_page": svgs_per_page,
'imgs_per_page'
:
imgs_per_page
,
# 增加每页img数量list
'junk_img_bojids'
:
junk_img_bojids
,
# 增加垃圾图片的bojid list
'invalid_chars'
:
invalid_chars
,
'metadata'
:
doc
.
metadata
,
}
# logger.info(json.dumps(res, ensure_ascii=False))
return
res
if
__name__
==
'__main__'
:
pass
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
# "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
# file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") # noqa: E501
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
# doc = fitz.open("pdf", file_content)
# text_layout_lst = get_pdf_text_layout_per_page(doc)
# print(text_layout_lst)
magic_pdf/integrations/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
magic_pdf/integrations/rag/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
magic_pdf/integrations/rag/api.py
deleted
100644 → 0
View file @
3bd0ecf1
import
os
from
pathlib
import
Path
from
loguru
import
logger
from
magic_pdf.integrations.rag.type
import
(
ElementRelation
,
LayoutElements
,
Node
)
from
magic_pdf.integrations.rag.utils
import
inference
class
RagPageReader
:
def
__init__
(
self
,
pagedata
:
LayoutElements
):
self
.
o
=
[
Node
(
category_type
=
v
.
category_type
,
text
=
v
.
text
,
image_path
=
v
.
image_path
,
anno_id
=
v
.
anno_id
,
latex
=
v
.
latex
,
html
=
v
.
html
,
)
for
v
in
pagedata
.
layout_dets
]
self
.
pagedata
=
pagedata
def
__iter__
(
self
):
return
iter
(
self
.
o
)
def
get_rel_map
(
self
)
->
list
[
ElementRelation
]:
return
self
.
pagedata
.
extra
.
element_relation
class
RagDocumentReader
:
def
__init__
(
self
,
ragdata
:
list
[
LayoutElements
]):
self
.
o
=
[
RagPageReader
(
v
)
for
v
in
ragdata
]
def
__iter__
(
self
):
return
iter
(
self
.
o
)
class
DataReader
:
def
__init__
(
self
,
path_or_directory
:
str
,
method
:
str
,
output_dir
:
str
):
self
.
path_or_directory
=
path_or_directory
self
.
method
=
method
self
.
output_dir
=
output_dir
self
.
pdfs
=
[]
if
os
.
path
.
isdir
(
path_or_directory
):
for
doc_path
in
Path
(
path_or_directory
).
glob
(
'*.pdf'
):
self
.
pdfs
.
append
(
doc_path
)
else
:
assert
path_or_directory
.
endswith
(
'.pdf'
)
self
.
pdfs
.
append
(
Path
(
path_or_directory
))
def
get_documents_count
(
self
)
->
int
:
"""Returns the number of documents in the directory."""
return
len
(
self
.
pdfs
)
def
get_document_result
(
self
,
idx
:
int
)
->
RagDocumentReader
|
None
:
"""
Args:
idx (int): the index of documents under the
directory path_or_directory
Returns:
RagDocumentReader | None: RagDocumentReader is an iterable object,
more details @RagDocumentReader
"""
if
idx
>=
self
.
get_documents_count
()
or
idx
<
0
:
logger
.
error
(
f
'invalid idx:
{
idx
}
'
)
return
None
res
=
inference
(
str
(
self
.
pdfs
[
idx
]),
self
.
output_dir
,
self
.
method
)
if
res
is
None
:
logger
.
warning
(
f
'failed to inference pdf
{
self
.
pdfs
[
idx
]
}
'
)
return
None
return
RagDocumentReader
(
res
)
def
get_document_filename
(
self
,
idx
:
int
)
->
Path
:
"""get the filename of the document."""
return
self
.
pdfs
[
idx
]
Prev
1
2
3
4
5
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment