wangsen / MinerU — commit 88495c32 (Unverified)
Authored Jun 11, 2025 by Xiaomeng Zhao; committed by GitHub on Jun 11, 2025

Merge pull request #13 from myhloli/refactor-mineru2

refactor: refactor-mineru2

Parents: ddf5a878, d96d9161
Changes: 261 — showing 20 changed files with 0 additions and 2036 deletions (+0 −2036)

.pre-commit-config.yaml                         +0 −47
magic_pdf/config/constants.py                   +0 −60
magic_pdf/config/drop_reason.py                 +0 −35
magic_pdf/config/drop_tag.py                    +0 −19
magic_pdf/config/enums.py                       +0 −7
magic_pdf/config/make_content_config.py         +0 −11
magic_pdf/config/model_block_type.py            +0 −10
magic_pdf/config/ocr_content_type.py            +0 −40
magic_pdf/data/batch_build_dataset.py           +0 −167
magic_pdf/data/data_reader_writer/__init__.py   +0 −12
magic_pdf/data/dataset.py                       +0 −408
magic_pdf/data/io/__init__.py                   +0 −6
magic_pdf/data/read_api.py                      +0 −142
magic_pdf/data/utils.py                         +0 −166
magic_pdf/filter/__init__.py                    +0 −32
magic_pdf/filter/pdf_classify_by_type.py        +0 −395
magic_pdf/filter/pdf_meta_scan.py               +0 −397
magic_pdf/integrations/__init__.py              +0 −0
magic_pdf/integrations/rag/__init__.py          +0 −0
magic_pdf/integrations/rag/api.py               +0 −82
.pre-commit-config.yaml deleted (100644 → 0)

repos:
  - repo: https://github.com/PyCQA/flake8
    rev: 5.0.4
    hooks:
      - id: flake8
        args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.32.0
    hooks:
      - id: yapf
        args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.1
    hooks:
      - id: codespell
        args: ['--skip', '*.json']
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://github.com/executablebooks/mdformat
    rev: 0.7.9
    hooks:
      - id: mdformat
        args: ["--number", "--table-width", "200"]
        additional_dependencies:
          - mdformat-openmmlab
          - mdformat_frontmatter
          - linkify-it-py
  - repo: https://github.com/myint/docformatter
    rev: v1.3.1
    hooks:
      - id: docformatter
        args: ["--in-place", "--wrap-descriptions", "119"]
magic_pdf/config/constants.py deleted (100644 → 0)

"""Custom span-level fields."""
# whether the span is merged across pages
CROSS_PAGE = 'cross_page'

"""Custom block-level fields."""
# whether lines in the block were deleted
LINES_DELETED = 'lines_deleted'

# table recognition max time default value
TABLE_MAX_TIME_VALUE = 400

# pp_table_result_max_length
TABLE_MAX_LEN = 480

# table master structure dict
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'

# table master dir
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'

# pp detect model dir
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'

# pp rec model dir
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'

# pp rec char dict path
REC_CHAR_DICT = 'ppocr_keys_v1.txt'

# pp rec copy rec directory
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'

# pp rec copy det directory
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'


class MODEL_NAME:
    # pp table structure algorithm
    TABLE_MASTER = 'tablemaster'
    # struct eqtable
    STRUCT_EQTABLE = 'struct_eqtable'

    DocLayout_YOLO = 'doclayout_yolo'
    LAYOUTLMv3 = 'layoutlmv3'
    YOLO_V8_MFD = 'yolo_v8_mfd'
    UniMerNet_v2_Small = 'unimernet_small'
    RAPID_TABLE = 'rapid_table'
    YOLO_V11_LangDetect = 'yolo_v11n_langdetect'


PARSE_TYPE_TXT = 'txt'
PARSE_TYPE_OCR = 'ocr'
magic_pdf/config/drop_reason.py deleted (100644 → 0)

class DropReason:
    TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # text blocks overlap horizontally, so the reading order cannot be determined reliably
    USEFUL_BLOCK_HOR_OVERLAP = 'useful_block_horizontal_overlap'  # blocks that must be kept overlap horizontally
    COMPLICATED_LAYOUT = 'complicated_layout'  # complex layout, not supported yet
    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # more than two columns is not supported yet
    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # colored background blocks change the reading order; text blocks on colored backgrounds are not supported yet
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = 'high_computational_load_by_imgs'  # special images make the computation too expensive, so the pdf is dropped
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = 'high_computational_load_by_svgs'  # special SVG images make the computation too expensive, so the pdf is dropped
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # total computation exceeds the budget of the current method
    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # layout analysis failed
    Exception = '_exception'  # an exception was raised during parsing
    ENCRYPTED = 'encrypted'  # the pdf is encrypted
    EMPTY_PDF = 'total_page=0'  # the pdf has zero pages
    NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # not a text-based pdf, cannot be parsed directly
    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # paragraphs cannot be segmented cleanly
    TITLE_DETECTION_FAILED = 'title_detection_failed'  # title detection failed
    TITLE_LEVEL_FAILED = 'title_level_failed'  # title level analysis failed (e.g. level 1 / 2 / 3 headings)
    PARA_SPLIT_FAILED = 'para_split_failed'  # paragraph detection failed
    PARA_MERGE_FAILED = 'para_merge_failed'  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = 'not_allow_language'  # unsupported language
    SPECIAL_PDF = 'special_pdf'
    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # the column split cannot be determined precisely
    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # the page layout cannot be analysed
    NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # scaling produced a negative bbox area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = 'overlap_blocks_can_t_separation'  # overlapping blocks cannot be separated
magic_pdf/config/drop_tag.py deleted (100644 → 0)

COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
PAGE_NO = 'page-no'  # page number
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header or footer area
VERTICAL_TEXT = 'vertical-text'  # vertical text
ROTATE_TEXT = 'rotate-text'  # rotated text
EMPTY_SIDE_BLOCK = 'empty-side-block'  # empty block on the page margin with no content
ON_IMAGE_TEXT = 'on-image-text'  # text on top of an image
ON_TABLE_TEXT = 'on-table-text'  # text on top of a table


class DropTag:
    PAGE_NUMBER = 'page_no'
    HEADER = 'header'
    FOOTER = 'footer'
    FOOTNOTE = 'footnote'
    NOT_IN_LAYOUT = 'not_in_layout'
    SPAN_OVERLAP = 'span_overlap'
    BLOCK_OVERLAP = 'block_overlap'
magic_pdf/config/enums.py deleted (100644 → 0)

import enum


class SupportedPdfParseMethod(enum.Enum):
    OCR = 'ocr'
    TXT = 'txt'
magic_pdf/config/make_content_config.py deleted (100644 → 0)

class MakeMode:
    MM_MD = 'mm_markdown'
    NLP_MD = 'nlp_markdown'
    STANDARD_FORMAT = 'standard_format'


class DropMode:
    WHOLE_PDF = 'whole_pdf'
    SINGLE_PAGE = 'single_page'
    NONE = 'none'
    NONE_WITH_REASON = 'none_with_reason'
magic_pdf/config/model_block_type.py deleted (100644 → 0)

from enum import Enum


class ModelBlockTypeEnum(Enum):
    TITLE = 0
    PLAIN_TEXT = 1
    ABANDON = 2
    ISOLATE_FORMULA = 8
    EMBEDDING = 13
    ISOLATED = 14
magic_pdf/config/ocr_content_type.py deleted (100644 → 0)

class ContentType:
    Image = 'image'
    Table = 'table'
    Text = 'text'
    InlineEquation = 'inline_equation'
    InterlineEquation = 'interline_equation'


class BlockType:
    Image = 'image'
    ImageBody = 'image_body'
    ImageCaption = 'image_caption'
    ImageFootnote = 'image_footnote'
    Table = 'table'
    TableBody = 'table_body'
    TableCaption = 'table_caption'
    TableFootnote = 'table_footnote'
    Text = 'text'
    Title = 'title'
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
    List = 'list'
    Index = 'index'


class CategoryId:
    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
    ImageFootnote = 101
magic_pdf/data/batch_build_dataset.py deleted (100644 → 0)

import concurrent.futures

import fitz  # PyMuPDF

from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.utils import fitz_doc_to_image


def partition_array_greedy(arr, k):
    """Partition an array into k parts using a simple greedy approach.

    Parameters:
    -----------
    arr : list
        The input array of integers
    k : int
        Number of partitions to create

    Returns:
    --------
    partitions : list of lists
        The k partitions of the array
    """
    # Handle edge cases
    if k <= 0:
        raise ValueError('k must be a positive integer')
    if k > len(arr):
        k = len(arr)  # Adjust k if it's too large
    if k == 1:
        return [list(range(len(arr)))]
    if k == len(arr):
        return [[i] for i in range(len(arr))]

    # Sort the array in descending order
    sorted_indices = sorted(range(len(arr)), key=lambda i: arr[i][1], reverse=True)

    # Initialize k empty partitions
    partitions = [[] for _ in range(k)]
    partition_sums = [0] * k

    # Assign each element to the partition with the smallest current sum
    for idx in sorted_indices:
        # Find the partition with the smallest sum
        min_sum_idx = partition_sums.index(min(partition_sums))
        # Add the element to this partition (store the original index)
        partitions[min_sum_idx].append(idx)
        partition_sums[min_sum_idx] += arr[idx][1]

    return partitions


def process_pdf_batch(pdf_jobs, idx):
    """Process a batch of PDF pages using multiple threads.

    Parameters:
    -----------
    pdf_jobs : list of tuples
        List of (pdf_path, page_num) tuples
    idx : int
        Index of this batch

    Returns:
    --------
    images : list
        List of processed images
    """
    images = []
    for pdf_path, _ in pdf_jobs:
        doc = fitz.open(pdf_path)
        tmp = []
        for page_num in range(len(doc)):
            page = doc[page_num]
            tmp.append(fitz_doc_to_image(page))
        images.append(tmp)
    return (idx, images)


def batch_build_dataset(pdf_paths, k, lang=None):
    """Process multiple PDFs by partitioning them into k balanced parts and
    processing each part in parallel.

    Parameters:
    -----------
    pdf_paths : list
        List of paths to PDF files
    k : int
        Number of partitions to create
    lang : str or None
        Language hint passed to each dataset

    Returns:
    --------
    results : list
        One PymuDocDataset per input path
    """
    results = []
    for pdf_path in pdf_paths:
        with open(pdf_path, 'rb') as f:
            pdf_bytes = f.read()
        dataset = PymuDocDataset(pdf_bytes, lang=lang)
        results.append(dataset)
    return results
#
# # Get page counts for each PDF
# pdf_info = []
# total_pages = 0
#
# for pdf_path in pdf_paths:
# try:
# doc = fitz.open(pdf_path)
# num_pages = len(doc)
# pdf_info.append((pdf_path, num_pages))
# total_pages += num_pages
# doc.close()
# except Exception as e:
# print(f'Error opening {pdf_path}: {e}')
#
# # Partition the jobs based on page countEach job has 1 page
# partitions = partition_array_greedy(pdf_info, k)
#
# # Process each partition in parallel
# all_images_h = {}
#
# with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
# # Submit one task per partition
# futures = []
# for sn, partition in enumerate(partitions):
# # Get the jobs for this partition
# partition_jobs = [pdf_info[idx] for idx in partition]
#
# # Submit the task
# future = executor.submit(
# process_pdf_batch,
# partition_jobs,
# sn
# )
# futures.append(future)
# # Process results as they complete
# for i, future in enumerate(concurrent.futures.as_completed(futures)):
# try:
# idx, images = future.result()
# all_images_h[idx] = images
# except Exception as e:
# print(f'Error processing partition: {e}')
# results = [None] * len(pdf_paths)
# for i in range(len(partitions)):
# partition = partitions[i]
# for j in range(len(partition)):
# with open(pdf_info[partition[j]][0], 'rb') as f:
# pdf_bytes = f.read()
# dataset = PymuDocDataset(pdf_bytes, lang=lang)
# dataset.set_images(all_images_h[i][j])
# results[partition[j]] = dataset
# return results
\ No newline at end of file
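
As a quick illustration of the greedy partitioner removed above, a minimal sketch; the (pdf_path, page_count) tuples and file names are invented:

# Illustration only: each job is a hypothetical (pdf_path, page_count) tuple,
# and the second element is what the greedy partitioner balances on.
jobs = [('a.pdf', 120), ('b.pdf', 30), ('c.pdf', 90), ('d.pdf', 60)]
parts = partition_array_greedy(jobs, k=2)
# parts contains index lists, here [[0, 1], [2, 3]]:
#   partition 0 -> a.pdf (120) + b.pdf (30) = 150 pages
#   partition 1 -> c.pdf (90) + d.pdf (60) = 150 pages
for part in parts:
    print([jobs[i][0] for i in part], sum(jobs[i][1] for i in part))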
magic_pdf/data/data_reader_writer/__init__.py deleted (100644 → 0)

from magic_pdf.data.data_reader_writer.filebase import \
    FileBasedDataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.filebase import \
    FileBasedDataWriter  # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
    MultiBucketS3DataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
    MultiBucketS3DataWriter  # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.s3 import S3DataWriter  # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataReader  # noqa: F401
from magic_pdf.data.data_reader_writer.base import DataWriter  # noqa: F401
\ No newline at end of file
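
A minimal usage sketch of the re-exported file-based reader, mirroring how read_api.py (further below) instantiates it; the file name is illustrative:

from magic_pdf.data.data_reader_writer import FileBasedDataReader

reader = FileBasedDataReader('')         # instantiated with '' exactly as in read_api.py
pdf_bytes = reader.read('example.pdf')   # returns the raw bytes of the file
print(len(pdf_bytes))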
magic_pdf/data/dataset.py deleted (100644 → 0)

import os
from abc import ABC, abstractmethod
from typing import Callable, Iterator

import fitz
from loguru import logger

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.schemas import PageInfo
from magic_pdf.data.utils import fitz_doc_to_image
from magic_pdf.filter import classify


class PageableData(ABC):
    @abstractmethod
    def get_image(self) -> dict:
        """Transform data to image."""
        pass

    @abstractmethod
    def get_doc(self) -> fitz.Page:
        """Get the pymudoc page."""
        pass

    @abstractmethod
    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page
        """
        pass

    @abstractmethod
    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
        """Draw a rectangle.

        Args:
            rect_coords (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            color (list[float] | None): three-element tuple describing the RGB of the border line; None means no border line
            fill (list[float] | None): RGB fill color; None means no fill
            fill_opacity (float): opacity of the fill, in the range [0, 1]
            width (float): width of the border line
            overlay (bool): draw in the foreground or background; True means draw in the background
        """
        pass

    @abstractmethod
    def insert_text(self, coord, content, fontsize, color):
        """Insert text.

        Args:
            coord (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            content (str): the text content
            fontsize (int): font size of the text
            color (list[float] | None): three-element tuple describing the RGB of the text; None uses the default font color
        """
        pass


class Dataset(ABC):
    @abstractmethod
    def __len__(self) -> int:
        """The length of the dataset."""
        pass

    @abstractmethod
    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page data."""
        pass

    @abstractmethod
    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods that this dataset supports.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods; valid methods are OCR and TXT
        """
        pass

    @abstractmethod
    def data_bits(self) -> bytes:
        """The bits used to create this dataset."""
        pass

    @abstractmethod
    def get_page(self, page_id: int) -> PageableData:
        """Get the page indexed by page_id.

        Args:
            page_id (int): the index of the page

        Returns:
            PageableData: the page doc object
        """
        pass

    @abstractmethod
    def dump_to_file(self, file_path: str):
        """Dump the dataset to a file.

        Args:
            file_path (str): the file path
        """
        pass

    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to this dataset.

        Args:
            proc (Callable): invoked as proc(self, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        pass

    @abstractmethod
    def classify(self) -> SupportedPdfParseMethod:
        """Classify the dataset.

        Returns:
            SupportedPdfParseMethod: the parse method suited to this dataset
        """
        pass

    @abstractmethod
    def clone(self):
        """Clone this dataset."""
        pass


class PymuDocDataset(Dataset):
    def __init__(self, bits: bytes, lang=None):
        """Initialize the dataset, which wraps the pymudoc documents.

        Args:
            bits (bytes): the bytes of the pdf
        """
        self._raw_fitz = fitz.open('pdf', bits)
        self._records = [Doc(v) for v in self._raw_fitz]
        self._data_bits = bits
        self._raw_data = bits
        self._classify_result = None

        if lang == '':
            self._lang = None
        elif lang == 'auto':
            from magic_pdf.model.sub_modules.language_detection.utils import \
                auto_detect_lang
            self._lang = auto_detect_lang(self._data_bits)
            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
        else:
            self._lang = lang
            logger.info(f'lang: {lang}')

    def __len__(self) -> int:
        """The page number of the pdf."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page doc object."""
        return iter(self._records)

    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]

    def dump_to_file(self, file_path: str):
        """Dump the pdf to a file.

        Args:
            file_path (str): the file path
        """
        dir_name = os.path.dirname(file_path)
        if dir_name not in ('', '.', '..'):
            os.makedirs(dir_name, exist_ok=True)
        self._raw_fitz.save(file_path)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to this dataset.

        Args:
            proc (Callable): invoked as proc(dataset, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        if 'lang' in kwargs and self._lang is not None:
            kwargs['lang'] = self._lang
        return proc(self, *args, **kwargs)

    def classify(self) -> SupportedPdfParseMethod:
        """Classify the dataset.

        Returns:
            SupportedPdfParseMethod: the parse method suited to this dataset
        """
        if self._classify_result is None:
            self._classify_result = classify(self._data_bits)
        return self._classify_result

    def clone(self):
        """Clone this dataset."""
        return PymuDocDataset(self._raw_data)

    def set_images(self, images):
        for i in range(len(self._records)):
            self._records[i].set_image(images[i])


class ImageDataset(Dataset):
    def __init__(self, bits: bytes, lang=None):
        """Initialize the dataset, which wraps the pymudoc documents.

        Args:
            bits (bytes): the bytes of the photo, which is converted to pdf first and then to pymudoc.
        """
        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
        self._raw_fitz = fitz.open('pdf', pdf_bytes)
        self._records = [Doc(v) for v in self._raw_fitz]
        self._raw_data = bits
        self._data_bits = pdf_bytes

        if lang == '':
            self._lang = None
        elif lang == 'auto':
            from magic_pdf.model.sub_modules.language_detection.utils import \
                auto_detect_lang
            self._lang = auto_detect_lang(self._data_bits)
            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
        else:
            self._lang = lang
            logger.info(f'lang: {lang}')

    def __len__(self) -> int:
        """The length of the dataset."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page object."""
        return iter(self._records)

    def supported_methods(self):
        """The methods supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]

    def dump_to_file(self, file_path: str):
        """Dump the pdf to a file.

        Args:
            file_path (str): the file path
        """
        dir_name = os.path.dirname(file_path)
        if dir_name not in ('', '.', '..'):
            os.makedirs(dir_name, exist_ok=True)
        self._raw_fitz.save(file_path)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to this dataset.

        Args:
            proc (Callable): invoked as proc(dataset, *args, **kwargs)

        Returns:
            Any: the result generated by proc
        """
        return proc(self, *args, **kwargs)

    def classify(self) -> SupportedPdfParseMethod:
        """Classify the dataset.

        Returns:
            SupportedPdfParseMethod: always OCR for image input
        """
        return SupportedPdfParseMethod.OCR

    def clone(self):
        """Clone this dataset."""
        return ImageDataset(self._raw_data)

    def set_images(self, images):
        for i in range(len(self._records)):
            self._records[i].set_image(images[i])


class Doc(PageableData):
    """Initialized with a pymudoc page object."""

    def __init__(self, doc: fitz.Page):
        self._doc = doc
        self._img = None

    def get_image(self):
        """Return the image info.

        Returns:
            dict: {
                img: np.ndarray,
                width: int,
                height: int
            }
        """
        if self._img is None:
            self._img = fitz_doc_to_image(self._doc)
        return self._img

    def set_image(self, img):
        """
        Args:
            img (np.ndarray): the image
        """
        if self._img is None:
            self._img = img

    def get_doc(self) -> fitz.Page:
        """Get the pymudoc object.

        Returns:
            fitz.Page: the pymudoc object
        """
        return self._doc

    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page
        """
        page_w = self._doc.rect.width
        page_h = self._doc.rect.height
        return PageInfo(w=page_w, h=page_h)

    def __getattr__(self, name):
        if hasattr(self._doc, name):
            return getattr(self._doc, name)

    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
        """Draw a rectangle.

        Args:
            rect_coords (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            color (list[float] | None): three-element tuple describing the RGB of the border line; None means no border line
            fill (list[float] | None): RGB fill color; None means no fill
            fill_opacity (float): opacity of the fill, in the range [0, 1]
            width (float): width of the border line
            overlay (bool): draw in the foreground or background; True means draw in the background
        """
        self._doc.draw_rect(
            rect_coords,
            color=color,
            fill=fill,
            fill_opacity=fill_opacity,
            width=width,
            overlay=overlay,
        )

    def insert_text(self, coord, content, fontsize, color):
        """Insert text.

        Args:
            coord (list[float]): four-element array with the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            content (str): the text content
            fontsize (int): font size of the text
            color (list[float] | None): three-element tuple describing the RGB of the text; None uses the default font color
        """
        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
\ No newline at end of file
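
A minimal usage sketch of the PymuDocDataset API deleted above; the pdf path, the helper name, and the assumption that PageInfo exposes the width as .w (as constructed in get_page_info()) are illustrative:

from magic_pdf.data.dataset import PymuDocDataset

with open('example.pdf', 'rb') as f:          # hypothetical input file
    ds = PymuDocDataset(f.read(), lang='en')

print(len(ds))        # number of pages
print(ds.classify())  # SupportedPdfParseMethod.TXT or SupportedPdfParseMethod.OCR

def count_wide_pages(dataset, min_width=400):
    # example proc for Dataset.apply(); it receives the dataset as its first argument
    # and assumes PageInfo exposes the page width as .w
    return sum(1 for page in dataset if page.get_page_info().w > min_width)

print(ds.apply(count_wide_pages, min_width=400))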
magic_pdf/data/io/__init__.py deleted (100644 → 0)

from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401

__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
magic_pdf/data/read_api.py deleted (100644 → 0)

import json
import os
import tempfile
import shutil
from pathlib import Path

from magic_pdf.config.exceptions import EmptyData, InvalidParams
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               MultiBucketS3DataReader)
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError


def read_jsonl(
    s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
) -> list[PymuDocDataset]:
    """Read a jsonl file and return a list of PymuDocDataset.

    Args:
        s3_path_or_local (str): local file or s3 path
        s3_client (MultiBucketS3DataReader | None, optional): s3 client that supports multiple buckets. Defaults to None.

    Raises:
        InvalidParams: if s3_path_or_local is an s3 path but s3_client is not provided.
        EmptyData: if a line of the jsonl file does not provide a pdf file location.
        InvalidParams: if a file location is an s3 path but s3_client is not provided.

    Returns:
        list[PymuDocDataset]: each line in the jsonl file is converted to a PymuDocDataset
    """
    bits_arr = []
    if s3_path_or_local.startswith('s3://'):
        if s3_client is None:
            raise InvalidParams('s3_client is required when s3_path is provided')
        jsonl_bits = s3_client.read(s3_path_or_local)
    else:
        jsonl_bits = FileBasedDataReader('').read(s3_path_or_local)

    jsonl_d = [
        json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
    ]
    for d in jsonl_d:
        pdf_path = d.get('file_location', '') or d.get('path', '')
        if len(pdf_path) == 0:
            raise EmptyData('pdf file location is empty')
        if pdf_path.startswith('s3://'):
            if s3_client is None:
                raise InvalidParams('s3_client is required when s3_path is provided')
            bits_arr.append(s3_client.read(pdf_path))
        else:
            bits_arr.append(FileBasedDataReader('').read(pdf_path))
    return [PymuDocDataset(bits) for bits in bits_arr]


def read_local_pdfs(path: str) -> list[PymuDocDataset]:
    """Read pdf from a path or directory.

    Args:
        path (str): pdf file path or directory that contains pdf files

    Returns:
        list[PymuDocDataset]: each pdf file is converted to a PymuDocDataset
    """
    if os.path.isdir(path):
        reader = FileBasedDataReader()
        ret = []
        for root, _, files in os.walk(path):
            for file in files:
                suffix = file.split('.')
                if suffix[-1] == 'pdf':
                    ret.append(PymuDocDataset(reader.read(os.path.join(root, file))))
        return ret
    else:
        reader = FileBasedDataReader()
        bits = reader.read(path)
        return [PymuDocDataset(bits)]


def read_local_office(path: str) -> list[PymuDocDataset]:
    """Read ms-office files (ppt, pptx, doc, docx) from a path or directory.

    Args:
        path (str): ms-office file or directory that contains ms-office files

    Returns:
        list[PymuDocDataset]: each ms-office file is converted to a PymuDocDataset

    Raises:
        ConvertToPdfError: failed to convert the ms-office file to pdf via libreoffice
        FileNotFoundError: file not found
        Exception: unknown exception raised
    """
    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
    fns = []
    ret = []
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
                suffix = Path(file).suffix
                if suffix in suffixes:
                    fns.append(os.path.join(root, file))
    else:
        fns.append(path)

    reader = FileBasedDataReader()
    temp_dir = tempfile.mkdtemp()
    for fn in fns:
        try:
            convert_file_to_pdf(fn, temp_dir)
        except ConvertToPdfError as e:
            raise e
        except FileNotFoundError as e:
            raise e
        except Exception as e:
            raise e
        fn_path = Path(fn)
        pdf_fn = f'{temp_dir}/{fn_path.stem}.pdf'
        ret.append(PymuDocDataset(reader.read(pdf_fn)))
    shutil.rmtree(temp_dir)
    return ret


def read_local_images(
    path: str, suffixes: list[str] = ['.png', '.jpg', '.jpeg']
) -> list[ImageDataset]:
    """Read images from a path or directory.

    Args:
        path (str): image file path or directory that contains image files
        suffixes (list[str]): the suffixes used to filter image files, e.g. ['.jpg', '.png']

    Returns:
        list[ImageDataset]: each image file is converted to an ImageDataset
    """
    if os.path.isdir(path):
        imgs_bits = []
        s_suffixes = set(suffixes)
        reader = FileBasedDataReader()
        for root, _, files in os.walk(path):
            for file in files:
                suffix = Path(file).suffix
                if suffix in s_suffixes:
                    imgs_bits.append(reader.read(os.path.join(root, file)))
        return [ImageDataset(bits) for bits in imgs_bits]
    else:
        reader = FileBasedDataReader()
        bits = reader.read(path)
        return [ImageDataset(bits)]
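
A small sketch of the jsonl layout read_jsonl() above expects, namely one JSON object per line carrying a 'file_location' (or 'path') key; the file names are invented:

import json

rows = [
    {'file_location': '/data/pdfs/report.pdf'},
    {'path': 's3://some-bucket/papers/paper.pdf'},  # s3 locations require passing an s3_client
]
with open('batch.jsonl', 'w') as f:
    for row in rows:
        f.write(json.dumps(row) + '\n')

# datasets = read_jsonl('batch.jsonl')
# -> the local line yields a PymuDocDataset; the s3 line raises InvalidParams without a client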
magic_pdf/data/utils.py deleted (100644 → 0)

import multiprocessing as mp
import threading
from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
                                as_completed)

import fitz
import numpy as np
from loguru import logger


def fitz_doc_to_image(page, dpi=200) -> dict:
    """Convert a fitz page to an image, then convert the image to a numpy array.

    Args:
        page (_type_): pymudoc page
        dpi (int, optional): render resolution. Defaults to 200.

    Returns:
        dict: {'img': numpy array, 'width': width, 'height': height}
    """
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pm = page.get_pixmap(matrix=mat, alpha=False)

    # If the width or height exceeds 4500 after scaling, do not scale further.
    if pm.width > 4500 or pm.height > 4500:
        pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    # Convert pixmap samples directly to numpy array
    img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)

    img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
    return img_dict


def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
    images = []
    with fitz.open('pdf', pdf_bytes) as doc:
        pdf_page_num = doc.page_count
        end_page_id = (
            end_page_id
            if end_page_id is not None and end_page_id >= 0
            else pdf_page_num - 1
        )
        if end_page_id > pdf_page_num - 1:
            logger.warning('end_page_id is out of range, use images length')
            end_page_id = pdf_page_num - 1

        for index in range(0, doc.page_count):
            if start_page_id <= index <= end_page_id:
                page = doc[index]
                mat = fitz.Matrix(dpi / 72, dpi / 72)
                pm = page.get_pixmap(matrix=mat, alpha=False)

                # If the width or height exceeds 4500 after scaling, do not scale further.
                if pm.width > 4500 or pm.height > 4500:
                    pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

                # Convert pixmap samples directly to numpy array
                img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)

                img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
            else:
                img_dict = {'img': [], 'width': 0, 'height': 0}
            images.append(img_dict)
    return images


def convert_page(bytes_page):
    pdfs = fitz.open('pdf', bytes_page)
    page = pdfs[0]
    return fitz_doc_to_image(page)


def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
    """Process PDF pages in parallel with a serialization-safe approach."""
    if num_workers is None:
        num_workers = mp.cpu_count()

    # Process the extracted page data in parallel
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Process the page data
        results = list(executor.map(convert_page, pages))
    return results


def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
    """Process all pages of a PDF using multiple threads.

    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    num_threads : int
        Number of threads to use
    **kwargs :
        Additional arguments for fitz_doc_to_image

    Returns:
    --------
    images : list
        List of processed images, in page order
    """
    # Open the PDF
    doc = fitz.open(pdf_path)
    num_pages = len(doc)

    # Create a list to store results in the correct order
    results = [None] * num_pages

    # Create a thread pool
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        futures = {}
        for page_num in range(num_pages):
            page = doc[page_num]
            future = executor.submit(fitz_doc_to_image, page, **kwargs)
            futures[future] = page_num

        # Process results as they complete
        for future in as_completed(futures):
            page_num = futures[future]
            try:
                results[page_num] = future.result()
            except Exception as e:
                print(f'Error processing page {page_num}: {e}')
                results[page_num] = None

    # Close the document
    doc.close()
    return results


if __name__ == '__main__':
    pdf = fitz.open('/tmp/[MS-DOC].pdf')
    pdf_page = [fitz.open() for i in range(pdf.page_count)]
    [pdf_page[i].insert_pdf(pdf, from_page=i, to_page=i) for i in range(pdf.page_count)]
    pdf_page = [v.tobytes() for v in pdf_page]
    results = parallel_process_pdf_safe(pdf_page, num_workers=16)
# threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)
""" benchmark results of multi-threaded processing (fitz page to image)
total page nums: 578
thread nums, time cost
1 7.351 sec
2 6.334 sec
4 5.968 sec
8 6.728 sec
16 8.085 sec
"""
""" benchmark results of multi-processor processing (fitz page to image)
total page nums: 578
processor nums, time cost
1 17.170 sec
2 10.170 sec
4 7.841 sec
8 7.900 sec
16 7.984 sec
"""
magic_pdf/filter/__init__.py deleted (100644 → 0)

from magic_pdf.config.drop_reason import DropReason
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan


def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
    """Use the pdf metadata to decide whether this is a text pdf or an ocr pdf."""
    pdf_meta = pdf_meta_scan(pdf_bytes)
    if pdf_meta.get('_need_drop', False):
        # the scan flagged the pdf to be dropped, so raise an exception
        raise Exception(f"pdf meta_scan need_drop, reason is {pdf_meta['_drop_reason']}")
    else:
        is_encrypted = pdf_meta['is_encrypted']
        is_needs_password = pdf_meta['is_needs_password']
        if is_encrypted or is_needs_password:
            # encrypted or password-protected pdfs are not processed
            raise Exception(f'pdf meta_scan need_drop, reason is {DropReason.ENCRYPTED}')
        else:
            is_text_pdf, results = do_classify(
                pdf_meta['total_page'],
                pdf_meta['page_width_pts'],
                pdf_meta['page_height_pts'],
                pdf_meta['image_info_per_page'],
                pdf_meta['text_len_per_page'],
                pdf_meta['imgs_per_page'],
                # pdf_meta['text_layout_per_page'],
                pdf_meta['invalid_chars'],
            )
            if is_text_pdf:
                return SupportedPdfParseMethod.TXT
            else:
                return SupportedPdfParseMethod.OCR
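
A minimal sketch of calling the classify() wrapper deleted above; the pdf path is illustrative:

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.filter import classify

with open('example.pdf', 'rb') as f:
    method = classify(f.read())

if method == SupportedPdfParseMethod.TXT:
    print('text layer present, parse directly')
else:
    print('looks scanned, route to OCR')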
magic_pdf/filter/pdf_classify_by_type.py deleted (100644 → 0)

"""
Classify whether a pdf is text-based, using the results produced by meta_scan.

Criteria:
1. A pdf is a text pdf if it satisfies any of the following:
   1. Sample N pages at random; any sampled page has more than 100 characters of text.
   2. At least one page contains zero images.
2. A pdf is a scanned pdf if it satisfies any of the following:
   1. ~~On 80% of the pages the largest image has the same size and covers more than 0.6 of the page area.~~
   2. Most pages have the same text length.
"""
import json
import sys
from collections import Counter

import click
import numpy as np
from loguru import logger

from magic_pdf.libs.commons import mymax, get_top_percent_list
from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min

TEXT_LEN_THRESHOLD = 100
AVG_TEXT_LEN_THRESHOLD = 100
TEXT_LEN_SAMPLE_RATIO = 0.1  # sample 10% of the pages for the text-length statistics


# A stitching scheme: merge the split images of some special scanned pdfs back into one full-page image
def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
    # First use a set to drop images whose bboxes are exact duplicates
    image_list_result = []
    for page_images in image_list:
        page_result = []
        dedup = set()
        for img in page_images:
            x0, y0, x1, y1, img_bojid = img
            if (x0, y0, x1, y1) in dedup:
                # duplicate bboxes occasionally appear and only need to be kept once
                continue
            else:
                dedup.add((x0, y0, x1, y1))
                page_result.append([x0, y0, x1, y1, img_bojid])
        image_list_result.append(page_result)

    # Next, merge the images on the same page that can be stitched together
    merged_images = []
    for page_images in image_list_result:
        if not page_images:
            continue

        # Sort the images on the page top-to-bottom, then left-to-right
        page_images.sort(key=lambda img: (img[1], img[0]))

        merged = [page_images[0]]
        for img in page_images[1:]:
            x0, y0, x1, y1, imgid = img

            last_img = merged[-1]
            last_x0, last_y0, last_x1, last_y1, last_imgid = last_img

            # A single image covering at least 90% of the page width or height is a precondition for stitching
            full_width = abs(x1 - x0) >= page_width * 0.9
            full_height = abs(y1 - y0) >= page_height * 0.9

            # If the width qualifies, check whether the images can be stitched vertically
            if full_width:
                # Vertical stitching requires the left/right edges to shift by at most max_offset, and the gap
                # between the bottom of the previous image and the top of this one to be at most max_gap
                close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) \
                    and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) \
                    and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)

            # If the height qualifies, check whether the images can be stitched horizontally
            if full_height:
                # Horizontal stitching requires the top/bottom edges to shift by at most max_offset, and the gap
                # between the right edge of the previous image and the left edge of this one to be at most max_gap
                close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) \
                    and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) \
                    and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)

            # Check if the image can be merged with the last image
            if (full_width and close1) or (full_height and close2):
                # Merge the image with the last image
                merged[-1] = [
                    min(x0, last_x0), min(y0, last_y0),
                    max(x1, last_x1), max(y1, last_y1),
                    imgid,
                ]
            else:
                # Add the image as a new image
                merged.append(img)

        merged_images.append(merged)

    return merged_images


def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list):
    """
    Return False if, on at least 80% of the pages, the largest image has the same size and covers more
    than 0.6 of the page area; otherwise return True.
    :param total_page:
    :param page_width:
    :param page_height:
    :param img_sz_list:
    :return:
    """
    # # As long as one page has no images it could be a text pdf, but that page must also contain no text;
    # # some scanned pdfs contain blank pages with neither images nor text.
    # if any([len(img_sz) == 0 for img_sz in img_sz_list]):  # there are pages without images
    #     # find the indexes of those pages
    #     empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0]
    #     # then check whether those pages contain text
    #     text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0]
    #     if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD:  # no images but some text, so it may be a text pdf; otherwise leave the decision to the next rule
    #         return True

    # Use objid to drop images that repeat more than 10 times; they are hidden transparent layers whose ids are identical.
    # First count how many times each id appears
    objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
    # Then remove the ones that repeat too often
    if total_page >= scan_max_page:
        # the new meta_scan only scans the first scan_max_page pages; if the pdf has more, treat total_page as scan_max_page
        total_page = scan_max_page

    repeat_threshold = 2  # set the bad_image threshold to 2
    # repeat_threshold = min(2, total_page)  # when total_page is 1 this becomes 1 and every image is misjudged as bad_img
    bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])

    # bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])]
    # text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in bad_image_page_idx and text_len > 0]

    # Special case: a text pdf where every page is covered by one huge transparent image
    # ("huge" meaning it covers more than 90% of the page area)
    # fake_image_ids = [objid for objid in bad_image_objid if
    #                   any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for
    #                        x0, y0, x1, y1, _ in images])]  # old code; the any() was always true — why?
    # fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images
    #                   if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9]
    # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]):  # the pages carrying these transparent images have text above the threshold
    #     return True

    # Filter out the repeated images
    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid]
                   for page_img_sz in img_sz_list]

    # Some scanned pdfs split one page image into many pieces; stitch them back together before measuring
    img_sz_list = merge_images(img_sz_list, page_width, page_height)

    # Compute the largest image area on each page, then the ratio of that area to the page area
    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz])
                               for page_img_sz in img_sz_list]
    page_area = page_width * page_height
    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]

    if len(max_image_area_per_page) >= 0.5 * total_page:
        # The threshold was lowered from 0.8 to 0.5 to cover the cases of 2 pages out of 3 and 1 page out of 2.
        # This check assumes the repeated images (hidden transparent layers with identical ids) were removed above.
        return False
    else:
        return True


def classify_by_text_len(text_len_list: list, total_page: int):
    """
    Randomly sample 10% of the pages (all pages if that is fewer than 5).
    If any sampled page has more than TEXT_LEN_THRESHOLD characters, the pdf is a text pdf.
    :param total_page:
    :param text_len_list:
    :return:
    """
    select_page_cnt = int(total_page * TEXT_LEN_SAMPLE_RATIO)  # sample 10% of the pages
    if select_page_cnt < 5:
        select_page_cnt = total_page

    # # exclude the first and last 10 pages
    # if total_page > 20:  # if there are more than 20 pages
    #     page_range = list(range(10, total_page - 10))  # from page 11 to the 11th page from the end
    # else:
    #     page_range = list(range(total_page))  # otherwise use all pages
    # page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False)
    # Excluding the first/last 10 pages is awkward for pdfs with only 21 or 22 pages: if the one or two sampled
    # middle pages happen to be empty, the pdf is misclassified. With the avg_words rule in place this rule can be skipped.
    page_num = np.random.choice(total_page, select_page_cnt, replace=False)
    text_len_lst = [text_len_list[i] for i in page_num]
    is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
    return is_text_pdf


def classify_by_avg_words(text_len_list: list):
    """
    Supplementary rule: if the average number of characters per page is below AVG_TEXT_LEN_THRESHOLD,
    the pdf is not a text pdf. This mainly targets image albums.
    :param text_len_list:
    :return:
    """
    sum_words = sum(text_len_list)
    count_of_numbers = len(text_len_list)
    if count_of_numbers == 0:
        is_text_pdf = False
    else:
        avg_words = round(sum_words / count_of_numbers)
        if avg_words > AVG_TEXT_LEN_THRESHOLD:
            is_text_pdf = True
        else:
            is_text_pdf = False
    return is_text_pdf


def classify_by_img_num(img_sz_list: list, img_num_list: list):
    """
    Supplementary rule: some scanned pdfs place every scanned page on every page, and meta_scan deduplicates them.
    Their meta_scan result has an img_sz_list made entirely of empty elements, while every entry of img_num_list
    is large and identical.
    :param img_sz_list:
    :param img_num_list:
    :return:
    """
    # Count the non-empty elements of img_sz_list
    count_img_sz_list_not_none = sum(1 for item in img_sz_list if item)
    # Take the top 80% of the elements
    top_eighty_percent = get_top_percent_list(img_num_list, 0.8)
    # img_sz_list has at most one non-empty element, the top 80% of the counts are all equal,
    # and the maximum count is at least junk_limit_min
    if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
        # Using max and min to check whether all values in the list are equal:
        # min_imgs = min(img_num_list)
        # max_imgs = max(img_num_list)
        #
        # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
        return False  # if this condition holds, the pdf is definitely not a text pdf
    else:
        return True  # otherwise it may still be a text pdf; decide with the other rules


def classify_by_text_layout(text_layout_per_page: list):
    """
    Determine whether the text layout is mainly vertical.

    Args:
        text_layout_per_page (list): text layout per page; each element is 'vertical' for vertical
            layout or 'horizontal' for horizontal layout.

    Returns:
        bool: False if the layout is mainly vertical, True otherwise.
    """
    # Count the vertical pages in text_layout_per_page
    count_vertical = sum(1 for item in text_layout_per_page if item == 'vertical')
    # Count the horizontal pages in text_layout_per_page
    count_horizontal = sum(1 for item in text_layout_per_page if item == 'horizontal')
    # Compute the proportion of vertical pages
    known_layout_cnt = count_vertical + count_horizontal
    if known_layout_cnt != 0:
        ratio = count_vertical / known_layout_cnt
        if ratio >= 0.5:
            # the threshold is 0.5 to cover the cases of 2 pages out of 3 and 1 page out of 2
            return False  # mainly vertical layout: treat as not a text pdf
        else:
            return True  # mainly horizontal layout: treat as a text pdf
    else:
        return False  # unknown layout: by default, treat as not a text pdf


def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
    """
    Determine whether a page is composed of narrow strips. Two conditions must hold:
    1. An image's width or height reaches 90% of the page width or height, and the long side is several times the short side.
    2. At least 80% of the images on the page satisfy condition 1.

    Args:
        page_width (float): page width
        page_height (float): page height
        img_sz_list (list): list of image sizes; each element is a tuple (x0, y0, x1, y1, size), where
            (x0, y0) is the top-left corner and (x1, y1) is the bottom-right corner of the image rectangle

    Returns:
        bool: True if the proportion of qualifying pages is below 0.5, False otherwise
    """

    def is_narrow_strip(img):
        x0, y0, x1, y1, _ = img
        width, height = x1 - x0, y1 - y0
        return any([
            # image width is at least 90% of the page width and at least 4 times its height
            width >= page_width * 0.9 and width >= height * 4,
            # image height is at least 90% of the page height and at least 4 times its width
            height >= page_height * 0.9 and height >= width * 4,
        ])

    # Number of pages that satisfy the condition
    narrow_strip_pages_count = 0

    # Iterate over all pages
    for page_img_list in img_sz_list:
        # Skip empty pages
        if not page_img_list:
            continue

        # Total number of images on the page
        total_images = len(page_img_list)

        # Number of narrow-strip images on the page
        narrow_strip_images_count = 0
        for img in page_img_list:
            if is_narrow_strip(img):
                narrow_strip_images_count += 1

        # Skip pages with fewer than 5 narrow-strip images
        if narrow_strip_images_count < 5:
            continue
        else:
            # If at least 80% of the images are narrow strips, count the page
            if narrow_strip_images_count / total_images >= 0.8:
                narrow_strip_pages_count += 1

    # Proportion of qualifying pages
    narrow_strip_pages_ratio = narrow_strip_pages_count / len(img_sz_list)

    return narrow_strip_pages_ratio < 0.5


def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list,
             img_num_list: list,
             # text_layout_list: list,
             invalid_chars: bool):
    """
    Image and page dimensions here are in pts.
    :param total_page:
    :param text_len_list:
    :param page_width:
    :param page_height:
    :param img_sz_list:
    :param pdf_path:
    :return:
    """
    results = {
        'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list),
        'by_text_len': classify_by_text_len(text_len_list, total_page),
        'by_avg_words': classify_by_avg_words(text_len_list),
        'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
        # 'by_text_layout': classify_by_text_layout(text_layout_list),
        'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
        'by_invalid_chars': invalid_chars,
    }

    if all(results.values()):
        return True, results
    elif not any(results.values()):
        return False, results
    else:
        logger.warning(
            f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
            f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
            # f" by_text_layout: {results['by_text_layout']},"
            f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
            f" by_invalid_chars: {results['by_invalid_chars']}",
            file=sys.stderr)
        # This mixed case helps to quickly spot unusual pdfs and tune the classification rules accordingly
        return False, results


@click.command()
@click.option("--json-file", type=str, help="pdf information")
def main(json_file):
    if json_file is None:
        print("json_file is None", file=sys.stderr)
        exit(0)
    try:
        with open(json_file, "r") as f:
            for l in f:
                if l.strip() == "":
                    continue
                o = json.loads(l)
                total_page = o["total_page"]
                page_width = o["page_width_pts"]
                page_height = o["page_height_pts"]
                img_sz_list = o["image_info_per_page"]
                text_len_list = o['text_len_per_page']
                text_layout_list = o['text_layout_per_page']
                pdf_path = o['pdf_path']
                is_encrypted = o['is_encrypted']
                is_needs_password = o['is_needs_password']
                if is_encrypted or total_page == 0 or is_needs_password:
                    # encrypted, password-protected, or empty pdfs are not processed
                    continue
                tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list)
                o['is_text_pdf'] = tag
                print(json.dumps(o, ensure_ascii=False))
    except Exception as e:
        print("ERROR: ", e, file=sys.stderr)


if __name__ == "__main__":
    main()
# false = False
# true = True
# null = None
# o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]]
,[[0,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368
,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1
386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,513,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],
[[0,0,368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,2085]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,513,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,0,
368,513,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,2433]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54
,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}}
# o = json.loads(json.dumps(o))
# total_page = o["total_page"]
# page_width = o["page_width_pts"]
# page_height = o["page_height_pts"]
# img_sz_list = o["image_info_per_page"]
# text_len_list = o['text_len_per_page']
# pdf_path = o['pdf_path']
# is_encrypted = o['is_encrypted']
# is_needs_password = o['is_needs_password']
# if is_encrypted or total_page == 0 or is_needs_password:  # skip PDFs that are encrypted, password-protected, or have no pages
# print("加密的")
# exit(0)
# tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list)
# o['is_text_pdf'] = tag
# print(json.dumps(o, ensure_ascii=False))
magic_pdf/filter/pdf_meta_scan.py
deleted
100644 → 0
View file @
ddf5a878
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
from
collections
import
Counter
import
fitz
from
loguru
import
logger
from
magic_pdf.config.drop_reason
import
DropReason
from
magic_pdf.libs.commons
import
get_top_percent_list
,
mymax
from
magic_pdf.libs.language
import
detect_lang
from
magic_pdf.libs.pdf_check
import
detect_invalid_chars_by_pymupdf
,
detect_invalid_chars
scan_max_page
=
50
junk_limit_min
=
10
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
    max_image_area_per_page = [
        mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz])
        for page_img_sz in result
    ]
    page_area = int(page_width_pts) * int(page_height_pts)
    max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
    max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
    return max_image_area_per_page
def process_image(page, junk_img_bojids=[]):
    page_result = []  # store the bbox quadruples of every image on this page
    items = page.get_images()
    dedup = set()
    for img in items:
        # get_image_rects below returns the size at which the image is actually displayed on the page
        img_bojid = img[0]  # globally unique within the pdf; an image that repeats across many pages is likely junk such as a watermark or header/footer
        if img_bojid in junk_img_bojids:  # skip junk images
            continue
        recs = page.get_image_rects(img, transform=True)
        if recs:
            rec = recs[0][0]
            x0, y0, x1, y1 = map(int, rec)
            width = x1 - x0
            height = y1 - y0
            if (x0, y0, x1, y1, img_bojid) in dedup:  # duplicate bboxes show up here and only need to be kept once
                continue
            if not all([width, height]):  # neither width nor height may be 0, otherwise the image is invisible and meaningless
                continue
            dedup.add((x0, y0, x1, y1, img_bojid))
            page_result.append([x0, y0, x1, y1, img_bojid])
    return page_result
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
    """Return the image quadruples of every page; a page may contain several images.

    :param doc:
    :return:
    """
    # count occurrences of each img_bojid with a Counter
    img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
    # find the img_bojids that appear on more than half of the pages
    junk_limit = max(len(doc) * 0.5, junk_limit_min)  # exempt documents with very few pages

    junk_img_bojids = [
        img_bojid for img_bojid, count in img_bojid_counter.items()
        if count >= junk_limit
    ]

    # todo: add a check using only the first ten pages; junk images must not only appear often enough,
    #  they must also cover a large share of the page, and the images should all be roughly the same size.
    # There are two kinds of scanned PDFs and one kind of text PDF, so misclassification is possible here:
    # Scanned type 1: every page embeds all scanned page images; images are large and one is shown per page.
    # Scanned type 2: the number of stored scan images grows page by page; images are large, one shown per page;
    #  the junk list must be cleared and the first 50 pages scanned for classification.
    # Text type 1: every page stores all images; images cover little of the page and 0 or more may be shown per page.
    #  Such PDFs need the first 10 pages sampled for image size and count; if they match, the junk list is cleared.
    imgs_len_list = [len(page.get_images()) for page in doc]

    special_limit_pages = 10

    # use the results of the first ten pages for the decision
    result = []
    break_loop = False
    for i, page in enumerate(doc):
        if break_loop:
            break
        if i >= special_limit_pages:
            break
        page_result = process_image(page)
        # junk_img_bojids is not passed here: collect all image info from the first ten pages for later analysis
        result.append(page_result)
        for item in result:
            if not any(item):  # a page without images means this is a text PDF; check whether it is the special text type
                if (max(imgs_len_list) == min(imgs_len_list)
                        and max(imgs_len_list) >= junk_limit_min):
                    # the special text type: clear the junk list and break
                    junk_img_bojids = []
                else:
                    # an ordinary text PDF that still contains junk images: keep the junk list
                    pass
                break_loop = True
                break
    if not break_loop:
        # take the first 80% of the elements
        top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8)
        # check whether the first 80% of the elements are all equal
        if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
            # # if every one of the first 10 pages has images, decide from the per-page image counts whether to clear the junk list
            # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:

            # the first 10 pages all have images with matching counts; check the image-to-page area ratio to decide whether to clear the junk list
            max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
            if len(max_image_area_per_page) < 0.8 * special_limit_pages:
                # the first 10 pages are not all large images, so this is probably a text PDF: clear the junk list
                junk_img_bojids = []
            else:
                # the first 10 pages all have images, 80% of them large, with many and equal counts per page: scanned type 1, keep the junk list
                pass
        else:
            # per-page image counts differ: clear the junk list and scan the first 50 pages in full
            junk_img_bojids = []

    # now collect the image info of the first 50 pages for real
    result = []
    for i, page in enumerate(doc):
        if i >= scan_max_page:
            break
        page_result = process_image(page, junk_img_bojids)
        # logger.info(f"page {i} img_len: {len(page_result)}")
        result.append(page_result)

    return result, junk_img_bojids
def get_pdf_page_size_pts(doc: fitz.Document):
    page_cnt = len(doc)
    l: int = min(page_cnt, 50)
    # collect all widths and heights into two lists and take the median of each
    # (some PDFs mix landscape pages into a portrait document, which would otherwise swap width and height)
    page_width_list = []
    page_height_list = []
    for i in range(l):
        page = doc[i]
        page_rect = page.rect
        page_width_list.append(page_rect.width)
        page_height_list.append(page_rect.height)

    page_width_list.sort()
    page_height_list.sort()

    median_width = page_width_list[len(page_width_list) // 2]
    median_height = page_height_list[len(page_height_list) // 2]

    return median_width, median_height
def get_pdf_textlen_per_page(doc: fitz.Document):
    text_len_lst = []
    for page in doc:
        # get all blocks, including images and text
        # text_block = page.get_text("blocks")
        # get only the text blocks
        # text_block = page.get_text("words")
        # text_block_len = sum([len(t[4]) for t in text_block])
        # get all text as a single str
        text_block = page.get_text('text')
        text_block_len = len(text_block)
        # logger.info(f"page {page.number} text_block_len: {text_block_len}")
        text_len_lst.append(text_block_len)

    return text_len_lst
def get_pdf_text_layout_per_page(doc: fitz.Document):
    """Determine whether the text layout of each page of the PDF is horizontal, vertical or unknown.

    Args:
        doc (fitz.Document): the PDF document object.

    Returns:
        List[str]: the text layout of each page (horizontal, vertical, unknown).
    """
    text_layout_list = []

    for page_id, page in enumerate(doc):
        if page_id >= scan_max_page:
            break
        # counters for vertical and horizontal text lines on this page
        vertical_count = 0
        horizontal_count = 0
        text_dict = page.get_text('dict')
        if 'blocks' in text_dict:
            for block in text_dict['blocks']:
                if 'lines' in block:
                    for line in block['lines']:
                        # bbox corner coordinates of the line
                        x0, y0, x1, y1 = line['bbox']
                        # width and height of the bbox
                        width = x1 - x0
                        height = y1 - y0
                        # area of the bbox
                        area = width * height
                        font_sizes = []
                        for span in line['spans']:
                            if 'size' in span:
                                font_sizes.append(span['size'])
                        if len(font_sizes) > 0:
                            average_font_size = sum(font_sizes) / len(font_sizes)
                        else:
                            average_font_size = (
                                10  # some lines carry no font_size; fall back to a default value
                            )
                        if area <= average_font_size ** 2:
                            # a bbox no larger than the squared average font size is a single character,
                            # whose orientation cannot be determined
                            continue
                        else:
                            if 'wmode' in line:  # determine text direction from wmode
                                if line['wmode'] == 1:  # vertical text
                                    vertical_count += 1
                                elif line['wmode'] == 0:  # horizontal text
                                    horizontal_count += 1
                        # if 'dir' in line:  # determine text direction from the rotation angle
                        #     # read the line's "dir" value
                        #     dir_value = line['dir']
                        #     cosine, sine = dir_value
                        #     # compute the angle
                        #     angle = math.degrees(math.acos(cosine))
                        #
                        #     # horizontal text
                        #     if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01:
                        #         # line_text = ' '.join(span['text'] for span in line['spans'])
                        #         # print('This line is horizontal:', line_text)
                        #         horizontal_count += 1
                        #     # vertical text
                        #     elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01:
                        #         # line_text = ' '.join(span['text'] for span in line['spans'])
                        #         # print('This line is vertical:', line_text)
                        #         vertical_count += 1
        # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
        # decide the layout of this page
        if vertical_count == 0 and horizontal_count == 0:  # the page has no text, so the layout cannot be determined
            text_layout_list.append('unknow')
            continue
        else:
            if vertical_count > horizontal_count:  # more vertical than horizontal lines on this page
                text_layout_list.append('vertical')
            else:  # more horizontal than vertical lines on this page
                text_layout_list.append('horizontal')
        # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
    return text_layout_list
"""定义一个自定义异常用来抛出单页svg太多的pdf"""
class
PageSvgsTooManyError
(
Exception
):
def
__init__
(
self
,
message
=
'Page SVGs are too many'
):
self
.
message
=
message
super
().
__init__
(
self
.
message
)
def get_svgs_per_page(doc: fitz.Document):
    svgs_len_list = []
    for page_id, page in enumerate(doc):
        # svgs = page.get_drawings()
        svgs = page.get_cdrawings()  # switched to get_cdrawings, which is more efficient
        len_svgs = len(svgs)
        if len_svgs >= 3000:
            raise PageSvgsTooManyError()
        else:
            svgs_len_list.append(len_svgs)
        # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
    return svgs_len_list
def get_imgs_per_page(doc: fitz.Document):
    imgs_len_list = []
    for page_id, page in enumerate(doc):
        imgs = page.get_images()
        imgs_len_list.append(len(imgs))
        # logger.info(f"page_id: {page}, imgs_len: {len(imgs)}")

    return imgs_len_list
def get_language(doc: fitz.Document):
    """Detect the language of the PDF document.

    Args:
        doc (fitz.Document): the PDF document object.

    Returns:
        str: the document language, e.g. "en-US".
    """
    language_lst = []
    for page_id, page in enumerate(doc):
        if page_id >= scan_max_page:
            break
        # get all text as a single str
        text_block = page.get_text('text')
        page_language = detect_lang(text_block)
        language_lst.append(page_language)
        # logger.info(f"page_id: {page_id}, page_language: {page_language}")

    # count how often each language appears in language_lst
    count_dict = Counter(language_lst)
    # return the language that occurs most often
    language = max(count_dict, key=count_dict.get)
    return language
def check_invalid_chars(pdf_bytes):
    """Detect garbled characters."""
    # return detect_invalid_chars_by_pymupdf(pdf_bytes)
    return detect_invalid_chars(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes):
    """
    :param pdf_bytes: the binary content of the pdf file
    Evaluated along several dimensions: whether it is encrypted, whether it needs a password,
    paper size, total page count, and whether text can be extracted.
    """
    doc = fitz.open('pdf', pdf_bytes)
    is_needs_password = doc.needs_pass
    is_encrypted = doc.is_encrypted
    total_page = len(doc)
    if total_page == 0:
        logger.warning(f'drop this pdf, drop_reason: {DropReason.EMPTY_PDF}')
        result = {'_need_drop': True, '_drop_reason': DropReason.EMPTY_PDF}
        return result
    else:
        page_width_pts, page_height_pts = get_pdf_page_size_pts(doc)
        # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}")

        # svgs_per_page = get_svgs_per_page(doc)
        # logger.info(f"svgs_per_page: {svgs_per_page}")
        imgs_per_page = get_imgs_per_page(doc)
        # logger.info(f"imgs_per_page: {imgs_per_page}")

        image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts)
        # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
        text_len_per_page = get_pdf_textlen_per_page(doc)
        # logger.info(f"text_len_per_page: {text_len_per_page}")
        # text_layout_per_page = get_pdf_text_layout_per_page(doc)
        # logger.info(f"text_layout_per_page: {text_layout_per_page}")
        # text_language = get_language(doc)
        # logger.info(f"text_language: {text_language}")
        invalid_chars = check_invalid_chars(pdf_bytes)
        # logger.info(f"invalid_chars: {invalid_chars}")

        # finally emit one json record
        res = {
            'is_needs_password': is_needs_password,
            'is_encrypted': is_encrypted,
            'total_page': total_page,
            'page_width_pts': int(page_width_pts),
            'page_height_pts': int(page_height_pts),
            'image_info_per_page': image_info_per_page,
            'text_len_per_page': text_len_per_page,
            # 'text_layout_per_page': text_layout_per_page,
            # 'text_language': text_language,
            # "svgs_per_page": svgs_per_page,
            'imgs_per_page': imgs_per_page,  # list of image counts per page
            'junk_img_bojids': junk_img_bojids,  # list of junk image bojids
            'invalid_chars': invalid_chars,
            'metadata': doc.metadata,
        }
        # logger.info(json.dumps(res, ensure_ascii=False))
        return res
if __name__ == '__main__':
    pass
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
    # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
    # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf"
    # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","")  # noqa: E501
    # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
    # doc = fitz.open("pdf", file_content)
    # text_layout_lst = get_pdf_text_layout_per_page(doc)
    # print(text_layout_lst)
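For reference, a minimal sketch of how this deleted scanner can be driven; the local file name is a hypothetical placeholder, and only fields defined by pdf_meta_scan above are read:

# Hypothetical driver for pdf_meta_scan; 'example.pdf' is an assumed sample file.
import json
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan

with open('example.pdf', 'rb') as f:
    meta = pdf_meta_scan(f.read())

if meta.get('_need_drop'):
    # empty PDFs come back with a drop marker instead of metadata
    print('dropped:', meta['_drop_reason'])
else:
    # the classify step shown in the commented-out test earlier consumes fields like these
    print(json.dumps({k: meta[k] for k in ('total_page', 'page_width_pts', 'page_height_pts')},
                     ensure_ascii=False))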
magic_pdf/integrations/__init__.py
deleted
100644 → 0
View file @
ddf5a878
magic_pdf/integrations/rag/__init__.py
deleted
100644 → 0
View file @
ddf5a878
magic_pdf/integrations/rag/api.py
deleted
100644 → 0
View file @
ddf5a878
import os
from pathlib import Path

from loguru import logger

from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
                                              Node)
from magic_pdf.integrations.rag.utils import inference
class RagPageReader:

    def __init__(self, pagedata: LayoutElements):
        self.o = [
            Node(
                category_type=v.category_type,
                text=v.text,
                image_path=v.image_path,
                anno_id=v.anno_id,
                latex=v.latex,
                html=v.html,
            ) for v in pagedata.layout_dets
        ]

        self.pagedata = pagedata

    def __iter__(self):
        return iter(self.o)

    def get_rel_map(self) -> list[ElementRelation]:
        return self.pagedata.extra.element_relation
class RagDocumentReader:

    def __init__(self, ragdata: list[LayoutElements]):
        self.o = [RagPageReader(v) for v in ragdata]

    def __iter__(self):
        return iter(self.o)
class DataReader:

    def __init__(self, path_or_directory: str, method: str, output_dir: str):
        self.path_or_directory = path_or_directory
        self.method = method
        self.output_dir = output_dir
        self.pdfs = []
        if os.path.isdir(path_or_directory):
            for doc_path in Path(path_or_directory).glob('*.pdf'):
                self.pdfs.append(doc_path)
        else:
            assert path_or_directory.endswith('.pdf')
            self.pdfs.append(Path(path_or_directory))

    def get_documents_count(self) -> int:
        """Returns the number of documents in the directory."""
        return len(self.pdfs)

    def get_document_result(self, idx: int) -> RagDocumentReader | None:
        """
        Args:
            idx (int): the index of documents under the
                directory path_or_directory

        Returns:
            RagDocumentReader | None: RagDocumentReader is an iterable object,
            more details @RagDocumentReader
        """
        if idx >= self.get_documents_count() or idx < 0:
            logger.error(f'invalid idx: {idx}')
            return None
        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
        if res is None:
            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
            return None
        return RagDocumentReader(res)

    def get_document_filename(self, idx: int) -> Path:
        """get the filename of the document."""
        return self.pdfs[idx]
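A minimal usage sketch of the deleted reader; the directory names and the 'auto' method string are illustrative assumptions:

# Hypothetical usage of DataReader; paths and method value are assumptions.
from magic_pdf.integrations.rag.api import DataReader

reader = DataReader('./pdfs', 'auto', './output')
for idx in range(reader.get_documents_count()):
    doc = reader.get_document_result(idx)  # RagDocumentReader, or None on failure
    if doc is None:
        continue
    for page in doc:          # iterate RagPageReader objects
        for node in page:     # iterate layout Nodes on the page
            print(node.category_type, (node.text or '')[:40])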