Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
0c7a0882
"src/graph/vscode:/vscode.git/clone" did not exist on "30b89e6a8aed084e5f24debed5f3bf6f191899fd"
Unverified
Commit
0c7a0882
authored
Jun 12, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jun 12, 2025
Browse files
Merge pull request #2611 from myhloli/dev
Dev
parents
3bd0ecf1
a392f445
Changes
262
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2036 deletions
+0
-2036
.pre-commit-config.yaml
.pre-commit-config.yaml
+0
-47
magic_pdf/config/constants.py
magic_pdf/config/constants.py
+0
-60
magic_pdf/config/drop_reason.py
magic_pdf/config/drop_reason.py
+0
-35
magic_pdf/config/drop_tag.py
magic_pdf/config/drop_tag.py
+0
-19
magic_pdf/config/enums.py
magic_pdf/config/enums.py
+0
-7
magic_pdf/config/make_content_config.py
magic_pdf/config/make_content_config.py
+0
-11
magic_pdf/config/model_block_type.py
magic_pdf/config/model_block_type.py
+0
-10
magic_pdf/config/ocr_content_type.py
magic_pdf/config/ocr_content_type.py
+0
-40
magic_pdf/data/batch_build_dataset.py
magic_pdf/data/batch_build_dataset.py
+0
-167
magic_pdf/data/data_reader_writer/__init__.py
magic_pdf/data/data_reader_writer/__init__.py
+0
-12
magic_pdf/data/dataset.py
magic_pdf/data/dataset.py
+0
-408
magic_pdf/data/io/__init__.py
magic_pdf/data/io/__init__.py
+0
-6
magic_pdf/data/read_api.py
magic_pdf/data/read_api.py
+0
-142
magic_pdf/data/utils.py
magic_pdf/data/utils.py
+0
-166
magic_pdf/filter/__init__.py
magic_pdf/filter/__init__.py
+0
-32
magic_pdf/filter/pdf_classify_by_type.py
magic_pdf/filter/pdf_classify_by_type.py
+0
-395
magic_pdf/filter/pdf_meta_scan.py
magic_pdf/filter/pdf_meta_scan.py
+0
-397
magic_pdf/integrations/__init__.py
magic_pdf/integrations/__init__.py
+0
-0
magic_pdf/integrations/rag/__init__.py
magic_pdf/integrations/rag/__init__.py
+0
-0
magic_pdf/integrations/rag/api.py
magic_pdf/integrations/rag/api.py
+0
-82
No files found.
.pre-commit-config.yaml
deleted
100644 → 0
View file @
3bd0ecf1
repos
:
-
repo
:
https://github.com/PyCQA/flake8
rev
:
5.0.4
hooks
:
-
id
:
flake8
args
:
[
"
--max-line-length=150"
,
"
--ignore=E131,E125,W503,W504,E203"
]
-
repo
:
https://github.com/PyCQA/isort
rev
:
5.11.5
hooks
:
-
id
:
isort
-
repo
:
https://github.com/pre-commit/mirrors-yapf
rev
:
v0.32.0
hooks
:
-
id
:
yapf
args
:
[
"
--style={based_on_style:
google,
column_limit:
150,
indent_width:
4}"
]
-
repo
:
https://github.com/codespell-project/codespell
rev
:
v2.2.1
hooks
:
-
id
:
codespell
args
:
[
'
--skip'
,
'
*.json'
]
-
repo
:
https://github.com/pre-commit/pre-commit-hooks
rev
:
v4.3.0
hooks
:
-
id
:
trailing-whitespace
-
id
:
check-yaml
-
id
:
end-of-file-fixer
-
id
:
requirements-txt-fixer
-
id
:
double-quote-string-fixer
-
id
:
check-merge-conflict
-
id
:
fix-encoding-pragma
args
:
[
"
--remove"
]
-
id
:
mixed-line-ending
args
:
[
"
--fix=lf"
]
-
repo
:
https://github.com/executablebooks/mdformat
rev
:
0.7.9
hooks
:
-
id
:
mdformat
args
:
[
"
--number"
,
"
--table-width"
,
"
200"
]
additional_dependencies
:
-
mdformat-openmmlab
-
mdformat_frontmatter
-
linkify-it-py
-
repo
:
https://github.com/myint/docformatter
rev
:
v1.3.1
hooks
:
-
id
:
docformatter
args
:
[
"
--in-place"
,
"
--wrap-descriptions"
,
"
119"
]
magic_pdf/config/constants.py
deleted
100644 → 0
View file @
3bd0ecf1
"""span维度自定义字段."""
# Whether the span was merged across pages.
CROSS_PAGE = 'cross_page'

# The bare string below is the original section marker; it reads
# "custom block-level fields".
"""
block维度自定义字段
"""
# Whether the lines inside the block have been deleted.
LINES_DELETED = 'lines_deleted'

# table recognition max time default value
TABLE_MAX_TIME_VALUE = 400

# pp_table_result_max_length
TABLE_MAX_LEN = 480

# table master structure dict
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'

# table master dir
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'

# pp detect model dir
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'

# pp rec model dir
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'

# pp rec char dict path
REC_CHAR_DICT = 'ppocr_keys_v1.txt'

# pp rec copy rec directory
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'

# pp rec copy det directory
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
class MODEL_NAME:
    """Namespace of model-name string constants."""

    # pp table structure algorithm
    TABLE_MASTER = 'tablemaster'
    # struct eqtable
    STRUCT_EQTABLE = 'struct_eqtable'

    DocLayout_YOLO = 'doclayout_yolo'

    LAYOUTLMv3 = 'layoutlmv3'

    YOLO_V8_MFD = 'yolo_v8_mfd'

    UniMerNet_v2_Small = 'unimernet_small'

    RAPID_TABLE = 'rapid_table'

    YOLO_V11_LangDetect = 'yolo_v11n_langdetect'
PARSE_TYPE_TXT
=
'txt'
PARSE_TYPE_OCR
=
'ocr'
magic_pdf/config/drop_reason.py
deleted
100644 → 0
View file @
3bd0ecf1
class DropReason:
    """Namespace of string codes, one per reason a PDF or page is dropped.

    NOTE(review): several member names carry typos (``BLCOK``, ``lOAD``) and
    ``Exception`` shadows the builtin inside the class body. They are part of
    the public interface and are therefore left untouched.
    """

    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # text blocks overlap horizontally, so the reading order cannot be determined reliably
    USEFUL_BLOCK_HOR_OVERLAP = (
        'useful_block_horizontal_overlap'  # blocks that must be kept overlap horizontally
    )
    COMPLICATED_LAYOUT = 'complicated_layout'  # complex layout, not supported for now
    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # layouts with more than 2 columns are not supported yet
    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # the PDF contains colored blocks, which change the reading order; text blocks on a colored background are not supported yet
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
        'high_computational_load_by_imgs'  # contains special images; too expensive to compute, so it is dropped
    )
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
        'high_computational_load_by_svgs'  # special SVG graphics; too expensive to compute, so it is dropped
    )
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # computational load exceeds capacity; the current approach is too expensive
    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # layout analysis failed
    Exception = '_exception'  # an exception occurred during parsing
    ENCRYPTED = 'encrypted'  # the PDF is encrypted
    EMPTY_PDF = 'total_page=0'  # the PDF has zero pages
    NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # not a text-based PDF, cannot be parsed directly
    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # cannot be split into paragraphs cleanly
    TITLE_DETECTION_FAILED = 'title_detection_failed'  # title detection failed
    TITLE_LEVEL_FAILED = (
        'title_level_failed'  # title-level analysis failed (e.g. level 1, 2, 3 headings)
    )
    PARA_SPLIT_FAILED = 'para_split_failed'  # paragraph recognition failed
    PARA_MERGE_FAILED = 'para_merge_failed'  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = 'not_allow_language'  # unsupported language
    SPECIAL_PDF = 'special_pdf'
    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # the text columns cannot be determined precisely
    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # the page layout cannot be analysed
    NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # scaling produced a bbox with negative area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
        'overlap_blocks_can_t_separation'  # overlapping blocks cannot be separated
    )
magic_pdf/config/drop_tag.py
deleted
100644 → 0
View file @
3bd0ecf1
# Module-level drop-tag string constants.
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
PAGE_NO = 'page-no'  # page number
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header/footer area
VERTICAL_TEXT = 'vertical-text'  # vertical text
ROTATE_TEXT = 'rotate-text'  # rotated text
EMPTY_SIDE_BLOCK = 'empty-side-block'  # blank block on the page edge with no content
ON_IMAGE_TEXT = 'on-image-text'  # text located on an image
ON_TABLE_TEXT = 'on-table-text'  # text located on a table
class DropTag:
    """Namespace of drop-tag string constants."""

    PAGE_NUMBER = 'page_no'
    HEADER = 'header'
    FOOTER = 'footer'
    FOOTNOTE = 'footnote'
    NOT_IN_LAYOUT = 'not_in_layout'
    SPAN_OVERLAP = 'span_overlap'
    BLOCK_OVERLAP = 'block_overlap'
magic_pdf/config/enums.py
deleted
100644 → 0
View file @
3bd0ecf1
import
enum
class SupportedPdfParseMethod(enum.Enum):
    """Enumeration of the PDF parse methods: ``OCR`` and ``TXT``."""

    OCR = 'ocr'
    TXT = 'txt'
magic_pdf/config/make_content_config.py
deleted
100644 → 0
View file @
3bd0ecf1
class MakeMode:
    """Namespace of output-mode string constants."""

    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    STANDARD_FORMAT = 'standard_format'
class DropMode:
    """Namespace of drop-mode string constants."""

    WHOLE_PDF = 'whole_pdf'
    SINGLE_PAGE = 'single_page'
    NONE = 'none'
    NONE_WITH_REASON = 'none_with_reason'
magic_pdf/config/model_block_type.py
deleted
100644 → 0
View file @
3bd0ecf1
from
enum
import
Enum
class ModelBlockTypeEnum(Enum):
    """Integer block-type ids.

    NOTE(review): presumably these are the class ids emitted by the layout /
    formula detection models -- confirm against the model output schema.
    """

    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
magic_pdf/config/ocr_content_type.py
deleted
100644 → 0
View file @
3bd0ecf1
class ContentType:
    """Namespace of content-type string constants."""

    Image = 'image'
    Table = 'table'
    Text = 'text'
    InlineEquation = 'inline_equation'
    InterlineEquation = 'interline_equation'
class BlockType:
    """Namespace of block-type string constants."""

    Image = 'image'
    ImageBody = 'image_body'
    ImageCaption = 'image_caption'
    ImageFootnote = 'image_footnote'
    Table = 'table'
    TableBody = 'table_body'
    TableCaption = 'table_caption'
    TableFootnote = 'table_footnote'
    Text = 'text'
    Title = 'title'
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
    List = 'list'
    Index = 'index'
class CategoryId:
    """Namespace of integer category ids.

    NOTE(review): presumably these mirror the class ids produced by the
    detection models -- confirm against the model output schema.
    """

    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
    ImageFootnote = 101
magic_pdf/data/batch_build_dataset.py
deleted
100644 → 0
View file @
3bd0ecf1
import
concurrent.futures
import
fitz
from
magic_pdf.data.dataset
import
PymuDocDataset
from
magic_pdf.data.utils
import
fitz_doc_to_image
# PyMuPDF
def partition_array_greedy(arr, k):
    """Split the indices of ``arr`` into ``k`` groups of roughly equal weight.

    The weight of item ``i`` is ``arr[i][1]``. Items are visited from heaviest
    to lightest and each one is placed in the group whose running weight is
    currently the smallest (the first such group on ties).

    Args:
        arr (list): sequence whose elements expose their weight at index 1.
        k (int): requested number of groups; capped at ``len(arr)``.

    Returns:
        list[list[int]]: the groups, each holding indices into ``arr``.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer')

    total = len(arr)
    # Never create more groups than there are items.
    k = min(k, total)

    # Trivial layouts: everything in one group, or one item per group.
    if k == 1:
        return [list(range(total))]
    if k == total:
        return [[position] for position in range(total)]

    heaviest_first = sorted(
        range(total), key=lambda position: arr[position][1], reverse=True
    )

    buckets = [[] for _ in range(k)]
    loads = [0] * k
    for position in heaviest_first:
        # min() returns the first minimal index, i.e. the lowest-numbered
        # bucket among those with the smallest load.
        lightest = min(range(k), key=loads.__getitem__)
        buckets[lightest].append(position)
        loads[lightest] += arr[position][1]

    return buckets
def process_pdf_batch(pdf_jobs, idx):
    """Render every page of every PDF in a batch, sequentially.

    Args:
        pdf_jobs (list[tuple]): ``(pdf_path, _)`` pairs. Only the path is
            used; the second element is ignored and all pages of each
            document are rendered.
        idx: batch identifier, returned unchanged so the caller can match
            results to the batch it submitted.

    Returns:
        tuple: ``(idx, images)`` where ``images[i]`` is the list of
        ``fitz_doc_to_image`` results for the i-th job, in page order.
    """
    images = []
    for pdf_path, _ in pdf_jobs:
        # Close each document as soon as its pages are rendered so a large
        # batch does not accumulate open file handles.
        with fitz.open(pdf_path) as doc:
            images.append([fitz_doc_to_image(page) for page in doc])
    return (idx, images)
def batch_build_dataset(pdf_paths, k, lang=None):
    """Build one ``PymuDocDataset`` per PDF file, in input order.

    Each file is read fully into memory and wrapped in a dataset. The work is
    done sequentially in the calling process.

    Args:
        pdf_paths (list): paths to the PDF files.
        k (int): not used by this implementation.
        lang: forwarded to ``PymuDocDataset`` as its ``lang`` argument.

    Returns:
        list: one ``PymuDocDataset`` per entry of ``pdf_paths``.
    """
    datasets = []
    for path in pdf_paths:
        with open(path, 'rb') as handle:
            raw = handle.read()
        datasets.append(PymuDocDataset(raw, lang=lang))
    return datasets
#
# # Get page counts for each PDF
# pdf_info = []
# total_pages = 0
#
# for pdf_path in pdf_paths:
# try:
# doc = fitz.open(pdf_path)
# num_pages = len(doc)
# pdf_info.append((pdf_path, num_pages))
# total_pages += num_pages
# doc.close()
# except Exception as e:
# print(f'Error opening {pdf_path}: {e}')
#
# # Partition the jobs based on page countEach job has 1 page
# partitions = partition_array_greedy(pdf_info, k)
#
# # Process each partition in parallel
# all_images_h = {}
#
# with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
# # Submit one task per partition
# futures = []
# for sn, partition in enumerate(partitions):
# # Get the jobs for this partition
# partition_jobs = [pdf_info[idx] for idx in partition]
#
# # Submit the task
# future = executor.submit(
# process_pdf_batch,
# partition_jobs,
# sn
# )
# futures.append(future)
# # Process results as they complete
# for i, future in enumerate(concurrent.futures.as_completed(futures)):
# try:
# idx, images = future.result()
# all_images_h[idx] = images
# except Exception as e:
# print(f'Error processing partition: {e}')
# results = [None] * len(pdf_paths)
# for i in range(len(partitions)):
# partition = partitions[i]
# for j in range(len(partition)):
# with open(pdf_info[partition[j]][0], 'rb') as f:
# pdf_bytes = f.read()
# dataset = PymuDocDataset(pdf_bytes, lang=lang)
# dataset.set_images(all_images_h[i][j])
# results[partition[j]] = dataset
# return results
\ No newline at end of file
magic_pdf/data/data_reader_writer/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
from
magic_pdf.data.data_reader_writer.filebase
import
\
FileBasedDataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.filebase
import
\
FileBasedDataWriter
# noqa: F401
from
magic_pdf.data.data_reader_writer.multi_bucket_s3
import
\
MultiBucketS3DataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.multi_bucket_s3
import
\
MultiBucketS3DataWriter
# noqa: F401
from
magic_pdf.data.data_reader_writer.s3
import
S3DataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.s3
import
S3DataWriter
# noqa: F401
from
magic_pdf.data.data_reader_writer.base
import
DataReader
# noqa: F401
from
magic_pdf.data.data_reader_writer.base
import
DataWriter
# noqa: F401
\ No newline at end of file
magic_pdf/data/dataset.py
deleted
100644 → 0
View file @
3bd0ecf1
import
os
from
abc
import
ABC
,
abstractmethod
from
typing
import
Callable
,
Iterator
import
fitz
from
loguru
import
logger
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.data.schemas
import
PageInfo
from
magic_pdf.data.utils
import
fitz_doc_to_image
from
magic_pdf.filter
import
classify
class PageableData(ABC):
    """Abstract interface of a single page that can be rendered and drawn on."""

    @abstractmethod
    def get_image(self) -> dict:
        """Return the page converted to an image, packaged in a dict."""

    @abstractmethod
    def get_doc(self) -> fitz.Page:
        """Return the underlying PyMuPDF page."""

    @abstractmethod
    def get_page_info(self) -> PageInfo:
        """Return the page info of this page.

        Returns:
            PageInfo: the page info of this page
        """

    @abstractmethod
    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
        """Draw a rectangle on the page.

        Args:
            rect_coords (list[float]): four-element array holding the top-left and bottom-right corners, [x0, y0, x1, y1]
            color (list[float] | None): RGB triple of the border line, None means no border line
            fill (list[float] | None): RGB triple used to fill the rectangle, None means no fill
            fill_opacity (float): opacity of the fill, in the range [0, 1]
            width (float): width of the border line
            overlay (bool): whether to draw in the foreground or the background.
                NOTE(review): the previous text said True means background;
                PyMuPDF documents overlay=True as foreground -- confirm.
        """

    @abstractmethod
    def insert_text(self, coord, content, fontsize, color):
        """Insert text on the page.

        Args:
            coord (list[float]): position of the text. The previous text described it as [x0, y0, x1, y1] -- TODO confirm the expected shape
            content (str): the text content
            fontsize (int): font size of the text
            color (list[float] | None): RGB triple of the text, None uses the default font color
        """
class Dataset(ABC):
    """Abstract interface of a collection of pages built from raw bytes."""

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of pages in the dataset."""

    @abstractmethod
    def __iter__(self) -> Iterator[PageableData]:
        """Iterate over the page objects."""

    @abstractmethod
    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """Return the parse methods this dataset supports.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods; valid values are OCR and TXT
        """

    @abstractmethod
    def data_bits(self) -> bytes:
        """Return the bytes this dataset was created from."""

    @abstractmethod
    def get_page(self, page_id: int) -> PageableData:
        """Return the page at index ``page_id``.

        Args:
            page_id (int): the index of the page

        Returns:
            PageableData: the page object
        """

    @abstractmethod
    def dump_to_file(self, file_path: str):
        """Write the dataset to a file.

        Args:
            file_path (str): the destination file path
        """

    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Call ``proc`` with this dataset as its first argument.

        Args:
            proc (Callable): invoked as ``proc(self, *args, **kwargs)``

        Returns:
            Any: whatever ``proc`` returns
        """

    @abstractmethod
    def classify(self) -> SupportedPdfParseMethod:
        """Classify the dataset.

        Returns:
            SupportedPdfParseMethod: the parse method chosen for this dataset
        """

    @abstractmethod
    def clone(self):
        """Return a copy of this dataset."""
class PymuDocDataset(Dataset):
    """Dataset backed by a PDF opened with PyMuPDF; one ``Doc`` per page."""

    def __init__(self, bits: bytes, lang=None):
        """Open the PDF and wrap each of its pages.

        Args:
            bits (bytes): the bytes of the pdf
            lang: language setting. ``''`` is stored as None, ``'auto'``
                triggers ``auto_detect_lang`` on the PDF bytes, and any other
                value (including None) is stored unchanged.
        """
        self._raw_fitz = fitz.open('pdf', bits)
        self._records = [Doc(v) for v in self._raw_fitz]
        self._data_bits = bits
        self._raw_data = bits
        self._classify_result = None  # filled lazily by classify()

        if lang == '':
            self._lang = None
        elif lang == 'auto':
            # Imported lazily so the detection module is only loaded when
            # auto-detection is actually requested.
            from magic_pdf.model.sub_modules.language_detection.utils import \
                auto_detect_lang
            self._lang = auto_detect_lang(self._data_bits)
            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
        else:
            self._lang = lang
            logger.info(f'lang: {lang}')

    def __len__(self) -> int:
        """The page number of the pdf."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page doc object."""
        return iter(self._records)

    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The method supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]

    def dump_to_file(self, file_path: str):
        """Save the PDF to ``file_path``, creating its directory if needed.

        Args:
            file_path (str): the file path
        """
        dir_name = os.path.dirname(file_path)
        # '', '.' and '..' already exist, so there is nothing to create.
        if dir_name not in ('', '.', '..'):
            os.makedirs(dir_name, exist_ok=True)
        self._raw_fitz.save(file_path)

    def apply(self, proc: Callable, *args, **kwargs):
        """Call ``proc`` with this dataset as its first argument.

        When the caller passes a ``lang`` keyword and this dataset has a
        language of its own, the dataset's language replaces the caller's.

        Args:
            proc (Callable): invoke proc as follows:
                proc(dataset, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        if 'lang' in kwargs and self._lang is not None:
            kwargs['lang'] = self._lang
        return proc(self, *args, **kwargs)

    def classify(self) -> SupportedPdfParseMethod:
        """Classify the dataset, caching the result after the first call.

        Returns:
            SupportedPdfParseMethod: the result of ``classify`` on the PDF bytes
        """
        if self._classify_result is None:
            self._classify_result = classify(self._data_bits)
        return self._classify_result

    def clone(self):
        """Return a new dataset built from the same bytes and language.

        The resolved language is passed along so the clone behaves like the
        original in ``apply`` and does not have to re-run auto-detection.
        """
        return PymuDocDataset(self._raw_data, lang=self._lang)

    def set_images(self, images):
        """Hand a pre-rendered image to every page.

        Args:
            images: indexable collection with one entry per page, in page
                order. An IndexError is raised if it is shorter than the
                number of pages.
        """
        for i, record in enumerate(self._records):
            record.set_image(images[i])
class ImageDataset(Dataset):
    """Dataset backed by an image that is first converted to a PDF."""

    def __init__(self, bits: bytes, lang=None):
        """Convert the image to PDF, open it and wrap each page.

        Args:
            bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
            lang: language setting. ``''`` is stored as None, ``'auto'``
                triggers ``auto_detect_lang`` on the converted PDF bytes, and
                any other value (including None) is stored unchanged.
        """
        # The image document is only needed for the conversion; close it
        # right away instead of leaving it to the garbage collector.
        with fitz.open(stream=bits) as image_doc:
            pdf_bytes = image_doc.convert_to_pdf()
        self._raw_fitz = fitz.open('pdf', pdf_bytes)
        self._records = [Doc(v) for v in self._raw_fitz]
        self._raw_data = bits  # original image bytes, used by clone()
        self._data_bits = pdf_bytes  # converted PDF bytes, returned by data_bits()

        if lang == '':
            self._lang = None
        elif lang == 'auto':
            # Imported lazily so the detection module is only loaded when
            # auto-detection is actually requested.
            from magic_pdf.model.sub_modules.language_detection.utils import \
                auto_detect_lang
            self._lang = auto_detect_lang(self._data_bits)
            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
        else:
            self._lang = lang
            logger.info(f'lang: {lang}')

    def __len__(self) -> int:
        """The length of the dataset."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page object."""
        return iter(self._records)

    def supported_methods(self):
        """The method supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]

    def dump_to_file(self, file_path: str):
        """Save the converted PDF to ``file_path``, creating its directory if needed.

        Args:
            file_path (str): the file path
        """
        dir_name = os.path.dirname(file_path)
        # '', '.' and '..' already exist, so there is nothing to create.
        if dir_name not in ('', '.', '..'):
            os.makedirs(dir_name, exist_ok=True)
        self._raw_fitz.save(file_path)

    def apply(self, proc: Callable, *args, **kwargs):
        """Call ``proc`` with this dataset as its first argument.

        Args:
            proc (Callable): invoke proc as follows:
                proc(dataset, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(self, *args, **kwargs)

    def classify(self) -> SupportedPdfParseMethod:
        """Classify the dataset.

        Returns:
            SupportedPdfParseMethod: always ``SupportedPdfParseMethod.OCR``
        """
        return SupportedPdfParseMethod.OCR

    def clone(self):
        """Return a new dataset built from the same image bytes and language."""
        return ImageDataset(self._raw_data, lang=self._lang)

    def set_images(self, images):
        """Hand a pre-rendered image to every page.

        Args:
            images: indexable collection with one entry per page, in page
                order. An IndexError is raised if it is shorter than the
                number of pages.
        """
        for i, record in enumerate(self._records):
            record.set_image(images[i])
class Doc(PageableData):
    """Wrapper around one PyMuPDF page with a lazily rendered, cached image.

    Attributes not defined here are looked up on the wrapped page.
    """

    def __init__(self, doc: fitz.Page):
        self._doc = doc
        # Filled on the first get_image() call or injected via set_image().
        self._img = None

    def get_image(self):
        """Return the image info, rendering the page on first use.

        Returns:
            dict: {
                img: np.ndarray,
                width: int,
                height: int
            }
        """
        if self._img is None:
            self._img = fitz_doc_to_image(self._doc)
        return self._img

    def set_image(self, img):
        """Store a pre-rendered image; ignored if an image is already cached.

        Args:
            img: the image. NOTE(review): presumably the same dict that
                get_image() returns rather than a bare array -- confirm.
        """
        if self._img is None:
            self._img = img

    def get_doc(self) -> fitz.Page:
        """Get the pymudoc object.

        Returns:
            fitz.Page: the pymudoc object
        """
        return self._doc

    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: built from the width and height of the page rectangle
        """
        page_rect = self._doc.rect
        return PageInfo(w=page_rect.width, h=page_rect.height)

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped page.

        Raises:
            AttributeError: if the wrapped page has no such attribute either.
                Returning None here would hide typos and make ``hasattr``
                answer True for every name.
        """
        # __getattr__ only runs when normal lookup fails. If '_doc' itself is
        # missing (instance created without __init__, e.g. while copying),
        # touching self._doc below would re-enter this method forever.
        if name == '_doc':
            raise AttributeError(name)
        return getattr(self._doc, name)

    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
        """Draw a rectangle on the wrapped page.

        All arguments are forwarded to the page's ``draw_rect``.

        Args:
            rect_coords (list[float]): four-element array holding the top-left and bottom-right corners, [x0, y0, x1, y1]
            color (list[float] | None): RGB triple of the border line, None means no border line
            fill (list[float] | None): RGB triple used to fill the rectangle, None means no fill
            fill_opacity (float): opacity of the fill, in the range [0, 1]
            width (float): width of the border line
            overlay (bool): whether to draw in the foreground or the background.
                NOTE(review): the previous text said True means background;
                PyMuPDF documents overlay=True as foreground -- confirm.
        """
        self._doc.draw_rect(
            rect_coords,
            color=color,
            fill=fill,
            fill_opacity=fill_opacity,
            width=width,
            overlay=overlay,
        )

    def insert_text(self, coord, content, fontsize, color):
        """Insert text on the wrapped page.

        All arguments are forwarded to the page's ``insert_text``.

        Args:
            coord (list[float]): position of the text. The previous text described it as [x0, y0, x1, y1] -- TODO confirm the expected shape
            content (str): the text content
            fontsize (int): font size of the text
            color (list[float] | None): RGB triple of the text, None uses the default font color
        """
        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
\ No newline at end of file
magic_pdf/data/io/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
from
magic_pdf.data.io.base
import
IOReader
,
IOWriter
# noqa: F401
from
magic_pdf.data.io.http
import
HttpReader
,
HttpWriter
# noqa: F401
from
magic_pdf.data.io.s3
import
S3Reader
,
S3Writer
# noqa: F401
__all__
=
[
'IOReader'
,
'IOWriter'
,
'HttpReader'
,
'HttpWriter'
,
'S3Reader'
,
'S3Writer'
]
\ No newline at end of file
magic_pdf/data/read_api.py
deleted
100644 → 0
View file @
3bd0ecf1
import
json
import
os
import
tempfile
import
shutil
from
pathlib
import
Path
from
magic_pdf.config.exceptions
import
EmptyData
,
InvalidParams
from
magic_pdf.data.data_reader_writer
import
(
FileBasedDataReader
,
MultiBucketS3DataReader
)
from
magic_pdf.data.dataset
import
ImageDataset
,
PymuDocDataset
from
magic_pdf.utils.office_to_pdf
import
convert_file_to_pdf
,
ConvertToPdfError
def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Load the PDFs listed in a jsonl file as ``PymuDocDataset`` objects.

    Every non-blank line of the jsonl file is parsed as JSON; the PDF location
    is taken from its ``file_location`` key, falling back to ``path``.

    Args:
        s3_path_or_local (str): local file or s3 path of the jsonl file
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that supports multiple buckets. Defaults to None.

    Raises:
        InvalidParams: if the jsonl path or any PDF location starts with ``s3://`` and no s3_client was given.
        EmptyData: if a line provides no PDF location.

    Returns:
        list[PymuDocDataset]: one dataset per non-blank line of the jsonl file
    """

    def _fetch(location: str) -> bytes:
        # Single place that decides between the s3 client and the local reader.
        if location.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            return s3_client.read(location)
        return FileBasedDataReader('').read(location)

    rows = _fetch(s3_path_or_local).decode().split('\n')
    records = [json.loads(row) for row in rows if row.strip()]

    pdf_blobs = []
    for record in records:
        location = record.get('file_location', '') or record.get('path', '')
        if len(location) == 0:
            raise EmptyData('pdf file location is empty')
        pdf_blobs.append(_fetch(location))

    return [PymuDocDataset(blob) for blob in pdf_blobs]
def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Read pdf from path or directory.

    A directory is walked recursively and every file whose suffix is exactly
    ``.pdf`` is loaded. Any other path is read as a single PDF regardless of
    its name.

    Args:
        path (str): pdf file path or directory that contains pdf files

    Returns:
        list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
    """
    reader = FileBasedDataReader()
    if not os.path.isdir(path):
        return [PymuDocDataset(reader.read(path))]

    ret = []
    for root, _, files in os.walk(path):
        for file in files:
            # Compare the real suffix, as the sibling readers do. Splitting
            # on '.' would also accept a file literally named "pdf".
            if Path(file).suffix == '.pdf':
                ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
    return ret
def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.

    Each file is converted to PDF inside a temporary directory, which is
    removed before returning, whether or not the conversion succeeded.

    Args:
        path (str): ms-office file or directory that contains ms-office files

    Returns:
        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset

    Raises:
        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
        FileNotFoundError: File not Found
        Exception: Unknown Exception raised
    """
    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
    fns = []
    ret = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
                if Path(file).suffix in suffixes:
                    fns.append(os.path.join(root, file))
    else:
        # A single path is converted as given, without a suffix check.
        fns.append(path)

    reader = FileBasedDataReader()
    temp_dir = tempfile.mkdtemp()
    try:
        for fn in fns:
            # Errors propagate unchanged; the finally clause handles cleanup.
            convert_file_to_pdf(fn, temp_dir)
            fn_path = Path(fn)
            pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
            ret.append(PymuDocDataset(reader.read(pdf_fn)))
    finally:
        # mkdtemp leaves deletion to the caller; do it on every exit path.
        shutil.rmtree(temp_dir)
    return ret
def read_local_images(path: str, suffixes: list[str] = ['.png', '.jpg', '.jpeg']) -> list[ImageDataset]:
    """Load one image, or every matching image under a directory.

    A directory is walked recursively and files are kept when their suffix is
    one of ``suffixes`` (exact match). Any other path is read as a single
    image and ``suffixes`` is not consulted.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str]): suffixes used to filter files in a directory. Example: ['.jpg', '.png']. The default list is only read, never modified.

    Returns:
        list[ImageDataset]: one dataset per image file
    """
    if not os.path.isdir(path):
        single_reader = FileBasedDataReader()
        return [ImageDataset(single_reader.read(path))]

    wanted = set(suffixes)
    reader = FileBasedDataReader()
    blobs = [
        reader.read(os.path.join(root, name))
        for root, _, names in os.walk(path)
        for name in names
        if Path(name).suffix in wanted
    ]
    return [ImageDataset(blob) for blob in blobs]
magic_pdf/data/utils.py
deleted
100644 → 0
View file @
3bd0ecf1
import
multiprocessing
as
mp
import
threading
from
concurrent.futures
import
(
ProcessPoolExecutor
,
ThreadPoolExecutor
,
as_completed
)
import
fitz
import
numpy
as
np
from
loguru
import
logger
def fitz_doc_to_image(page, dpi=200) -> dict:
    """Render a PyMuPDF page and return it as a numpy array with its size.

    The page is rendered without an alpha channel at a scale of ``dpi / 72``.
    If either side of the result is larger than 4500 pixels, the page is
    rendered again at scale 1 and that result is used instead.

    Args:
        page: the PyMuPDF page to render
        dpi (int, optional): target resolution. Defaults to 200.

    Returns:
        dict: {'img': uint8 array of shape (height, width, 3), 'width': width, 'height': height}
    """
    zoom = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    if pixmap.width > 4500 or pixmap.height > 4500:
        # Scaled render is too large: fall back to the unscaled page.
        pixmap = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    # Interpret the raw sample buffer directly as an array.
    pixels = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
        pixmap.height, pixmap.width, 3
    )
    return {'img': pixels, 'width': pixmap.width, 'height': pixmap.height}
def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
) -> list:
    """Render a page range of a PDF, keeping one list entry per page.

    Pages whose index lies in ``[start_page_id, end_page_id]`` are rendered
    with ``fitz_doc_to_image``. Every other page gets the placeholder
    ``{'img': [], 'width': 0, 'height': 0}`` so list positions always match
    page indices.

    Args:
        pdf_bytes (bytes): the bytes of the pdf
        dpi (int, optional): resolution passed to ``fitz_doc_to_image``. Defaults to 200.
        start_page_id (int, optional): first page index to render. Defaults to 0.
        end_page_id (int | None, optional): last page index to render, inclusive.
            None or a negative value means the last page; a value past the end
            is clamped to the last page with a warning.

    Returns:
        list: one image dict per page of the document
    """
    images = []
    with fitz.open('pdf', pdf_bytes) as doc:
        pdf_page_num = doc.page_count
        end_page_id = (
            end_page_id
            if end_page_id is not None and end_page_id >= 0
            else pdf_page_num - 1
        )
        if end_page_id > pdf_page_num - 1:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = pdf_page_num - 1

        for index in range(pdf_page_num):
            if start_page_id <= index <= end_page_id:
                # Reuse the shared renderer instead of repeating its logic.
                img_dict = fitz_doc_to_image(doc[index], dpi=dpi)
            else:
                img_dict = {'img': [], 'width': 0, 'height': 0}
            images.append(img_dict)
    return images
def convert_page(bytes_page):
    """Render the first page of a PDF given as bytes.

    Args:
        bytes_page (bytes): the bytes of a PDF document

    Returns:
        dict: the ``fitz_doc_to_image`` result for page 0
    """
    # Close the document once the page is rendered; this function is mapped
    # over many pages in worker processes and would otherwise leave each
    # document open.
    with fitz.open('pdf', bytes_page) as pdfs:
        return fitz_doc_to_image(pdfs[0])
def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Render single-page PDFs in a process pool.

    Each element of ``pages`` is passed to ``convert_page`` in a worker
    process. Extra keyword arguments are accepted but not used.

    Args:
        pages: iterable of PDF byte strings
        num_workers (int | None, optional): pool size; None uses ``mp.cpu_count()``.

    Returns:
        list: the ``convert_page`` results, in the order of ``pages``
    """
    workers = mp.cpu_count() if num_workers is None else num_workers
    with ProcessPoolExecutor(max_workers=workers) as pool:
        # list() drains the map iterator before the pool shuts down.
        return list(pool.map(convert_page, pages))
def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order. The entry of a page whose
        rendering raised is None.
    """
    doc = fitz.open(pdf_path)
    try:
        num_pages = len(doc)

        # Pre-sized so results can be stored by page number as they complete.
        results = [None] * num_pages

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Map each future back to the page it renders.
            futures = {
                executor.submit(fitz_doc_to_image, doc[page_num], **kwargs): page_num
                for page_num in range(num_pages)
            }

            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    results[page_num] = future.result()
                except Exception as e:
                    # Best effort: report the page and keep the others.
                    print(f'Error processing page {page_num}: {e}')
                    results[page_num] = None
    finally:
        # Close the document on every exit path.
        doc.close()

    return results
if __name__ == '__main__':
    # Benchmark driver: split a sample PDF into one single-page document per
    # page, serialise each to bytes and convert them in worker processes.
    pdf = fitz.open('/tmp/[MS-DOC].pdf')

    pdf_page = []
    for i in range(pdf.page_count):
        single_page_doc = fitz.open()
        single_page_doc.insert_pdf(pdf, from_page=i, to_page=i)
        pdf_page.append(single_page_doc)

    # Bytes can be pickled and shipped to worker processes.
    pdf_page = [v.tobytes() for v in pdf_page]
    results = parallel_process_pdf_safe(pdf_page, num_workers=16)

    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)

    # benchmark results of multi-threaded processing (fitz page to image)
    # total page nums: 578
    # thread nums, time cost
    # 1 7.351 sec
    # 2 6.334 sec
    # 4 5.968 sec
    # 8 6.728 sec
    # 16 8.085 sec

    # benchmark results of multi-processor processing (fitz page to image)
    # total page nums: 578
    # processor nums, time cost
    # 1 17.170 sec
    # 2 10.170 sec
    # 4 7.841 sec
    # 8 7.900 sec
    # 16 7.984 sec
magic_pdf/filter/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
from
magic_pdf.config.drop_reason
import
DropReason
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.filter.pdf_classify_by_type
import
classify
as
do_classify
from
magic_pdf.filter.pdf_meta_scan
import
pdf_meta_scan
def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
    """Decide from the PDF's metadata whether it is a text PDF or an OCR PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF, passed to ``pdf_meta_scan``.

    Returns:
        ``SupportedPdfParseMethod.TXT`` when ``do_classify`` reports a text
        PDF, otherwise ``SupportedPdfParseMethod.OCR``.

    Raises:
        Exception: If the scan result carries the ``_need_drop`` flag, or if
            the document is encrypted or needs a password.
    """
    pdf_meta = pdf_meta_scan(pdf_bytes)

    # The scan flagged the document to be dropped: surface the reason.
    if pdf_meta.get('_need_drop', False):
        raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")

    is_encrypted = pdf_meta['is_encrypted']
    is_needs_password = pdf_meta['is_needs_password']
    # Encrypted or password-protected documents are not processed.
    if is_encrypted or is_needs_password:
        raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')

    is_text_pdf, _ = do_classify(
        pdf_meta['total_page'],
        pdf_meta['page_width_pts'],
        pdf_meta['page_height_pts'],
        pdf_meta['image_info_per_page'],
        pdf_meta['text_len_per_page'],
        pdf_meta['imgs_per_page'],
        # pdf_meta['text_layout_per_page'],
        pdf_meta['invalid_chars'],
    )
    return SupportedPdfParseMethod.TXT if is_text_pdf else SupportedPdfParseMethod.OCR
magic_pdf/filter/pdf_classify_by_type.py
deleted
100644 → 0
View file @
3bd0ecf1
This diff is collapsed.
Click to expand it.
magic_pdf/filter/pdf_meta_scan.py
deleted
100644 → 0
View file @
3bd0ecf1
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
from
collections
import
Counter
import
fitz
from
loguru
import
logger
from
magic_pdf.config.drop_reason
import
DropReason
from
magic_pdf.libs.commons
import
get_top_percent_list
,
mymax
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.pdf_check
import
detect_invalid_chars_by_pymupdf
,
detect_invalid_chars
# Page cap applied by get_image_info (final pass), get_pdf_text_layout_per_page
# and get_language: pages at or beyond this index are not inspected.
scan_max_page = 50
# Floor for the occurrence count at which get_image_info treats an image id as
# junk; also the minimum per-page image count in its special-case checks.
junk_limit_min = 10
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
    """Return the page-coverage ratios of pages dominated by one image.

    Args:
        result: One entry per page, each a sequence of
            ``(x0, y0, x1, y1, image_id)`` items.
        page_width_pts: Page width; truncated with ``int()`` before use.
        page_height_pts: Page height; truncated with ``int()`` before use.

    Returns:
        list: For every page, the value ``mymax`` selects from that page's
        image areas divided by the page area, keeping only ratios above 0.6.
    """
    top_area_per_page = []
    for page_boxes in result:
        box_areas = [(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_boxes]
        top_area_per_page.append(mymax(box_areas))

    page_area = int(page_width_pts) * int(page_height_pts)
    coverage = [top_area / page_area for top_area in top_area_per_page]
    return [ratio for ratio in coverage if ratio > 0.6]
def process_image(page, junk_img_bojids=None):
    """Collect the visible, de-duplicated image boxes of a single page.

    Args:
        page: Object exposing ``get_images()`` and
            ``get_image_rects(img, transform=True)``.
        junk_img_bojids: Optional iterable of image ids to skip. An id is the
            first element of a ``get_images()`` entry. ``None`` or an empty
            collection means nothing is skipped.

    Returns:
        list: ``[x0, y0, x1, y1, img_bojid]`` lists with integer coordinates,
        one per distinct box of non-zero width and height.
    """
    # A ``None`` default avoids sharing one mutable list across calls, and a
    # frozenset makes the per-image membership test O(1).
    junk_ids = frozenset(junk_img_bojids) if junk_img_bojids else frozenset()

    page_result = []  # boxes found on this page
    dedup = set()
    for img in page.get_images():
        # Ids repeated across the document may be junk such as watermarks or
        # page headers and footers.
        img_bojid = img[0]
        if img_bojid in junk_ids:
            continue

        recs = page.get_image_rects(img, transform=True)
        if not recs:
            continue

        # Only the first placement is used; its first element is the box.
        x0, y0, x1, y1 = map(int, recs[0][0])
        box = (x0, y0, x1, y1, img_bojid)
        if box in dedup:
            # The same box can be reported more than once; keep one copy.
            continue
        if x1 - x0 == 0 or y1 - y0 == 0:
            # A box with zero width or height is invisible and carries no
            # useful information.
            continue

        dedup.add(box)
        page_result.append(list(box))
    return page_result
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> tuple:
    """Collect per-page image boxes plus the image ids treated as junk.

    Image ids that recur on many pages are junk candidates. The first pages
    are sampled to decide whether that candidate list is kept or cleared,
    then the boxes of the leading pages are collected with the surviving
    junk ids skipped.

    :param doc: document to scan; it is iterated page by page several times
    :param page_width_pts: page width, forwarded to
        calculate_max_image_area_per_page
    :param page_height_pts: page height, forwarded to
        calculate_max_image_area_per_page
    :return: ``(result, junk_img_bojids)``; ``result`` holds one
        process_image() list per page for at most the first scan_max_page
        pages, ``junk_img_bojids`` is the list of ids skipped in that pass
    """
    # Count how often each img_bojid occurs over all pages.
    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
    # An id becomes a junk candidate once it occurs on at least half of the
    # pages; junk_limit_min is the floor, which exempts short documents.
    junk_limit = max(len(doc) * 0.5, junk_limit_min)
    junk_img_bojids = [
        img_bojid
        for img_bojid, count in img_bojid_counter.items()
        if count >= junk_limit
    ]
    # TODO: add a check that only needs the first ten pages. Junk images
    # should satisfy two conditions: they occur often enough, and each covers
    # a large enough share of the page with all of them similar in size.
    # Two scanned variants and one text variant can be misjudged here:
    # Scanned variant 1: every page references all scanned-page images;
    #   images cover most of the page and one is displayed per page.
    # Scanned variant 2: the number of scanned-page images stored per page
    #   keeps growing; images cover most of the page and one is displayed per
    #   page. The junk list must be cleared and the first 50 pages scanned.
    # Text variant 1: every page stores all images; images cover a small share
    #   of the page and a page may display none or several. Sample the first
    #   10 pages for image size and count, and clear the junk list on a match.
    imgs_len_list = [len(page.get_images()) for page in doc]
    special_limit_pages = 10
    # Always base the decision on the first ten pages.
    result = []
    break_loop = False
    for i, page in enumerate(doc):
        if break_loop:
            break
        if i >= special_limit_pages:
            break
        # junk_img_bojids is deliberately not passed: every image on the
        # sampled pages is needed for the analysis below.
        page_result = process_image(page)
        result.append(page_result)
        for item in result:
            # A sampled page without any image points to a text PDF; check
            # whether it is the special text variant.
            if not any(item):
                if (
                    max(imgs_len_list) == min(imgs_len_list)
                    and max(imgs_len_list) >= junk_limit_min
                ):
                    # Special text variant: clear the junk list.
                    junk_img_bojids = []
                else:
                    # Ordinary text PDF that still carries junk images: keep
                    # the junk list.
                    pass
                break_loop = True
                break
    if not break_loop:
        # Take the leading 80% of the per-page image counts.
        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
        # Check whether those counts are all equal.
        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
            # # If all of the first 10 pages have images, decide by whether the
            # # per-page image counts are equal
            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
            # Every sampled page has images and the counts agree: use the
            # share of the page covered by images to decide.
            max_image_area_per_page = calculate_max_image_area_per_page(
                result, page_width_pts, page_height_pts
            )
            if (
                len(max_image_area_per_page) < 0.8 * special_limit_pages
            ):
                # Not mostly large images, so this may be a text PDF: clear
                # the junk list.
                junk_img_bojids = []
            else:
                # At least 80% large images with matching, high per-page
                # counts: scanned variant 1, keep the junk list.
                pass
        else:
            # Per-page image counts differ: clear the junk list so the scan
            # below sees every image.
            junk_img_bojids = []
    # Final pass: collect image boxes for the first scan_max_page pages.
    result = []
    for i, page in enumerate(doc):
        if i >= scan_max_page:
            break
        page_result = process_image(page, junk_img_bojids)
        # logger.info(f"page {i} img_len: {len(page_result)}")
        result.append(page_result)
    return result, junk_img_bojids
def get_pdf_page_size_pts(doc: fitz.Document):
    """Return a representative ``(width, height)`` for the document's pages.

    Widths and heights of at most the first 50 pages are sorted separately
    and the element at index ``count // 2`` of each is returned. Taking a
    middle element keeps a few rotated pages (e.g. landscape pages inside a
    portrait document) from swapping the reported width and height.

    Args:
        doc: Indexable document whose pages expose ``rect.width`` and
            ``rect.height``.

    Returns:
        tuple: ``(median_width, median_height)``.
    """
    sample_size = min(len(doc), 50)
    rects = [doc[idx].rect for idx in range(sample_size)]

    widths = sorted(rect.width for rect in rects)
    heights = sorted(rect.height for rect in rects)

    middle = sample_size // 2
    return widths[middle], heights[middle]
def get_pdf_textlen_per_page(doc: fitz.Document):
    """Return the length of the extracted plain text of every page.

    Args:
        doc: Iterable of pages exposing ``get_text('text')``.

    Returns:
        list: ``len(page.get_text('text'))`` for each page, in page order.
    """
    # Alternatives that were tried and left disabled in the original code:
    # get_text("blocks") (image and text blocks) and get_text("words").
    return [len(page.get_text('text')) for page in doc]
def get_pdf_text_layout_per_page(doc: fitz.Document):
    """Classify the text direction of each page.

    Only the first ``scan_max_page`` pages are inspected.

    Args:
        doc (fitz.Document): Document whose pages expose ``get_text('dict')``.

    Returns:
        List[str]: One of ``'vertical'``, ``'horizontal'`` or ``'unknow'``
        per inspected page.
    """
    layouts = []
    for page_no, page in enumerate(doc):
        if page_no >= scan_max_page:
            break

        # Per-page tallies of vertical and horizontal text lines.
        n_vertical = 0
        n_horizontal = 0

        page_dict = page.get_text('dict')
        for block in page_dict.get('blocks', ()):
            for line in block.get('lines', ()):
                left, top, right, bottom = line['bbox']
                box_area = (right - left) * (bottom - top)

                sizes = [span['size'] for span in line['spans'] if 'size' in span]
                # Some lines expose no font size; 10 is used as the fallback.
                mean_size = sum(sizes) / len(sizes) if sizes else 10

                # A box no larger than one glyph (font size squared) is a
                # single character, whose direction cannot be determined.
                if box_area <= mean_size ** 2:
                    continue
                if 'wmode' not in line:
                    continue

                # wmode 1 is counted as vertical, wmode 0 as horizontal.
                writing_mode = line['wmode']
                if writing_mode == 1:
                    n_vertical += 1
                elif writing_mode == 0:
                    n_horizontal += 1

        if n_vertical == 0 and n_horizontal == 0:
            # No usable text line on this page.
            layouts.append('unknow')
        elif n_vertical > n_horizontal:
            layouts.append('vertical')
        else:
            # Ties are reported as horizontal.
            layouts.append('horizontal')
    return layouts
class PageSvgsTooManyError(Exception):
    """Custom exception for a PDF in which a single page has too many SVGs."""

    def __init__(self, message='Page SVGs are too many'):
        super().__init__(message)
        # Kept as an attribute so handlers can read the text directly.
        self.message = message
def get_svgs_per_page(doc: fitz.Document):
    """Return the number of drawings reported for every page.

    Args:
        doc: Iterable of pages exposing ``get_cdrawings()``.

    Returns:
        list: ``len(page.get_cdrawings())`` per page, in page order.

    Raises:
        PageSvgsTooManyError: As soon as a page reports 3000 or more drawings.
    """
    counts = []
    for page in doc:
        # get_cdrawings replaced get_drawings because it is more efficient.
        drawing_total = len(page.get_cdrawings())
        if drawing_total >= 3000:
            raise PageSvgsTooManyError()
        counts.append(drawing_total)
    return counts
def get_imgs_per_page(doc: fitz.Document):
    """Return how many image entries each page reports.

    Args:
        doc: Iterable of pages exposing ``get_images()``.

    Returns:
        list: ``len(page.get_images())`` per page, in page order.
    """
    return [len(page.get_images()) for page in doc]
def get_language(doc: fitz.Document):
    """Return the language detected most often across the leading pages.

    Args:
        doc (fitz.Document): Document whose pages expose ``get_text('text')``.

    Returns:
        The ``detect_lang`` result that occurs on the most pages among the
        first ``scan_max_page`` pages. On a tie the value seen first wins.
    """
    per_page_language = []
    for page_no, page in enumerate(doc):
        if page_no >= scan_max_page:
            break
        # Detect on the page's full plain text.
        per_page_language.append(detect_lang(page.get_text('text')))

    # Tally the per-page results and pick the most frequent one.
    tally = Counter(per_page_language)
    return max(tally, key=tally.get)
def check_invalid_chars(pdf_bytes):
    """Run garbled-text detection by delegating to ``detect_invalid_chars``.

    Args:
        pdf_bytes: Raw bytes of the PDF, forwarded unchanged.

    Returns:
        The value returned by ``detect_invalid_chars``.
    """
    # Disabled alternative backend:
    # return detect_invalid_chars_by_pymupdf(pdf_bytes)
    detection_result = detect_invalid_chars(pdf_bytes)
    return detection_result
def pdf_meta_scan(pdf_bytes: bytes):
    """Scan a PDF and return the metadata used to classify it.

    The scan covers: whether the file is encrypted, whether it needs a
    password, the page size, the page count, and how much text and how many
    images each page carries.

    :param pdf_bytes: binary content of the PDF file
    :return: ``{'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}``
        for a document without pages; otherwise a dict with the keys
        ``is_needs_password``, ``is_encrypted``, ``total_page``,
        ``page_width_pts``, ``page_height_pts``, ``image_info_per_page``,
        ``text_len_per_page``, ``imgs_per_page``, ``junk_img_bojids``,
        ``invalid_chars`` and ``metadata``
    """
    # The context manager closes the document on every exit path; the
    # original left it open until garbage collection.
    with fitz.open('pdf', pdf_bytes) as doc:
        is_needs_password = doc.needs_pass
        is_encrypted = doc.is_encrypted
        total_page = len(doc)
        if total_page == 0:
            logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}')
            return {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}

        page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
        # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")

        # svgs_per_page = get_svgs_per_page(doc)
        # logger.info(f"svgs_per_page: {svgs_per_page}")
        imgs_per_page = get_imgs_per_page(doc)
        # logger.info(f"imgs_per_page: {imgs_per_page}")

        image_info_per_page, junk_img_bojids = get_image_info(
            doc, page_width_pts, page_height_pts
        )
        # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
        text_len_per_page = get_pdf_textlen_per_page(doc)
        # logger.info(f"text_len_per_page: {text_len_per_page}")
        # text_layout_per_page = get_pdf_text_layout_per_page(doc)
        # logger.info(f"text_layout_per_page: {text_layout_per_page}")
        # text_language = get_language(doc)
        # logger.info(f"text_language: {text_language}")
        invalid_chars = check_invalid_chars(pdf_bytes)
        # logger.info(f"invalid_chars: {invalid_chars}")

        # Assemble the result record while the document is still open, since
        # ``doc.metadata`` is read from it.
        res = {
            'is_needs_password': is_needs_password,
            'is_encrypted': is_encrypted,
            'total_page': total_page,
            'page_width_pts': int(page_width_pts),
            'page_height_pts': int(page_height_pts),
            'image_info_per_page': image_info_per_page,
            'text_len_per_page': text_len_per_page,
            # 'text_layout_per_page': text_layout_per_page,
            # 'text_language': text_language,
            # "svgs_per_page": svgs_per_page,
            'imgs_per_page': imgs_per_page,  # image count of every page
            'junk_img_bojids': junk_img_bojids,  # ids of the junk images
            'invalid_chars': invalid_chars,
            'metadata': doc.metadata,
        }
        # logger.info(json.dumps(res, ensure_ascii=False))
        return res
if __name__ == '__main__':
    # Running this module as a script does nothing. The commented lines below
    # are leftover sample paths and a manual check of
    # get_pdf_text_layout_per_page.
    pass
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
    # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")  # noqa: E501
    # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
    # doc = fitz.open("pdf", file_content)
    # text_layout_lst = get_pdf_text_layout_per_page(doc)
    # print(text_layout_lst)
magic_pdf/integrations/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
magic_pdf/integrations/rag/__init__.py
deleted
100644 → 0
View file @
3bd0ecf1
magic_pdf/integrations/rag/api.py
deleted
100644 → 0
View file @
3bd0ecf1
import
os
from
pathlib
import
Path
from
loguru
import
logger
from
magic_pdf.integrations.rag.type
import
(
ElementRelation
,
LayoutElements
,
Node
)
from
magic_pdf.integrations.rag.utils
import
inference
class RagPageReader:
    """Iterable over the ``Node`` objects built from one page's layout data."""

    def __init__(self, pagedata: LayoutElements):
        """Build one ``Node`` per entry of ``pagedata.layout_dets``.

        Args:
            pagedata: Layout data of a single page.
        """
        nodes = []
        for det in pagedata.layout_dets:
            nodes.append(
                Node(
                    category_type=det.category_type,
                    text=det.text,
                    image_path=det.image_path,
                    anno_id=det.anno_id,
                    latex=det.latex,
                    html=det.html,
                )
            )
        self.o = nodes
        self.pagedata = pagedata

    def __iter__(self):
        """Iterate over the page's nodes in ``layout_dets`` order."""
        return iter(self.o)

    def get_rel_map(self) -> list[ElementRelation]:
        """Return ``pagedata.extra.element_relation`` of this page."""
        extra = self.pagedata.extra
        return extra.element_relation
class RagDocumentReader:
    """Iterable over the ``RagPageReader`` objects of one document."""

    def __init__(self, ragdata: list[LayoutElements]):
        """Wrap every element of ``ragdata`` in a ``RagPageReader``.

        Args:
            ragdata: Per-page layout data, one element per page.
        """
        self.o = list(map(RagPageReader, ragdata))

    def __iter__(self):
        """Iterate over the page readers in the order of ``ragdata``."""
        return iter(self.o)
class DataReader:
    """Index the PDFs under a path and run inference on them on demand."""

    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        """Collect the PDF files to serve.

        Args:
            path_or_directory: Either a directory, whose direct ``*.pdf``
                children are collected, or the path of a single ``.pdf`` file.
            method: Forwarded to ``inference`` for every document.
            output_dir: Forwarded to ``inference`` for every document.

        Raises:
            AssertionError: If ``path_or_directory`` is not a directory and
                does not end with ``.pdf``.
        """
        self.path_or_directory = path_or_directory
        self.method = method
        self.output_dir = output_dir
        self.pdfs = []
        if os.path.isdir(path_or_directory):
            # ``glob`` yields entries in an unspecified order; sorting keeps
            # the index -> document mapping stable between runs.
            self.pdfs.extend(sorted(Path(path_or_directory).glob('*.pdf')))
        else:
            # Raised explicitly (same exception type as the former ``assert``)
            # so the check is not stripped when Python runs with ``-O``.
            if not path_or_directory.endswith('.pdf'):
                raise AssertionError(
                    f'expected a directory or a .pdf file, got: {path_or_directory}'
                )
            self.pdfs.append(Path(path_or_directory))

    def get_documents_count(self) -> int:
        """Returns the number of documents in the directory."""
        return len(self.pdfs)

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """Run inference on one document.

        Args:
            idx (int): the index of documents under the
                directory path_or_directory

        Returns:
            RagDocumentReader | None: RagDocumentReader is an iterable object,
            more details @RagDocumentReader. ``None`` is returned when ``idx``
            is out of range or when ``inference`` returns ``None``.
        """
        if idx >= self.get_documents_count() or idx < 0:
            logger.error(f'invalid idx: {idx}')
            return None
        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
        if res is None:
            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
            return None
        return RagDocumentReader(res)

    def get_document_filename(self, idx: int) -> Path:
        """get the filename of the document.

        ``idx`` is used as a plain list index and is not range-checked here.
        """
        return self.pdfs[idx]
Prev
1
2
3
4
5
…
14
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment