Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
84fa04e2
Commit
84fa04e2
authored
Jun 12, 2025
by
myhloli
Browse files
feat: enhance PDF image coverage analysis with improved parsing and coverage calculation
parent
cfc78406
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
84 additions
and
45 deletions
+84
-45
mineru/utils/pdf_classify.py
mineru/utils/pdf_classify.py
+84
-45
No files found.
mineru/utils/pdf_classify.py
View file @
84fa04e2
...
@@ -5,8 +5,13 @@ import numpy as np
...
@@ -5,8 +5,13 @@ import numpy as np
import
pypdfium2
as
pdfium
import
pypdfium2
as
pdfium
from
loguru
import
logger
from
loguru
import
logger
from
pdfminer.high_level
import
extract_text
from
pdfminer.high_level
import
extract_text
from
pdfminer.layout
import
LAParams
from
pdfminer.pdfparser
import
PDFParser
from
pypdf
import
PdfReader
from
pdfminer.pdfdocument
import
PDFDocument
from
pdfminer.pdfpage
import
PDFPage
from
pdfminer.pdfinterp
import
PDFResourceManager
from
pdfminer.pdfinterp
import
PDFPageInterpreter
from
pdfminer.layout
import
LAParams
,
LTImage
,
LTFigure
from
pdfminer.converter
import
PDFPageAggregator
def
classify
(
pdf_bytes
):
def
classify
(
pdf_bytes
):
...
@@ -41,7 +46,7 @@ def classify(pdf_bytes):
...
@@ -41,7 +46,7 @@ def classify(pdf_bytes):
return
'ocr'
return
'ocr'
else
:
else
:
if
get_high_image_coverage_ratio
(
sample_pdf_bytes
,
pages_to_check
)
>=
0.
9
:
if
get_high_image_coverage_ratio
(
sample_pdf_bytes
,
pages_to_check
)
>=
0.
8
:
return
'ocr'
return
'ocr'
return
'txt'
return
'txt'
...
@@ -77,60 +82,94 @@ def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
...
@@ -77,60 +82,94 @@ def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
return
avg_cleaned_chars_per_page
return
avg_cleaned_chars_per_page
def
get_high_image_coverage_ratio
(
sample_pdf_bytes
,
pages_to_check
):
def
get_high_image_coverage_ratio
(
sample_pdf_bytes
,
pages_to_check
):
# 创建内存文件对象
pdf_stream
=
BytesIO
(
sample_pdf_bytes
)
pdf_stream
=
BytesIO
(
sample_pdf_bytes
)
pdf_reader
=
PdfReader
(
pdf_stream
)
# 创建PDF解析器
parser
=
PDFParser
(
pdf_stream
)
# 创建PDF文档对象
document
=
PDFDocument
(
parser
)
# 检查文档是否允许文本提取
if
not
document
.
is_extractable
:
# logger.warning("PDF不允许内容提取")
return
1.0
# 默认为高覆盖率,因为无法提取内容
# 创建资源管理器和参数对象
rsrcmgr
=
PDFResourceManager
()
laparams
=
LAParams
(
line_overlap
=
0.5
,
char_margin
=
2.0
,
line_margin
=
0.5
,
word_margin
=
0.1
,
boxes_flow
=
None
,
detect_vertical
=
False
,
all_texts
=
False
,
)
# 创建聚合器
device
=
PDFPageAggregator
(
rsrcmgr
,
laparams
=
laparams
)
# 创建解释器
interpreter
=
PDFPageInterpreter
(
rsrcmgr
,
device
)
# 记录高图像覆盖率的页面数量
# 记录高图像覆盖率的页面数量
high_image_coverage_pages
=
0
high_image_coverage_pages
=
0
page_count
=
0
# 检查前几页的图像
# 遍历页面
for
i
in
range
(
pages_to_check
):
for
page
in
PDFPage
.
create_pages
(
document
):
page
=
pdf_reader
.
pages
[
i
]
# 控制检查的页数
if
page_count
>=
pages_to_check
:
break
# 处理页面
interpreter
.
process_page
(
page
)
layout
=
device
.
get_result
()
#
获取
页面尺寸
# 页面尺寸
page_width
=
float
(
page
.
mediabox
.
width
)
page_width
=
layout
.
width
page_height
=
float
(
page
.
mediabox
.
height
)
page_height
=
layout
.
height
page_area
=
page_width
*
page_height
page_area
=
page_width
*
page_height
#
估
算图像覆盖
率
#
计
算图像覆盖
的总面积
image_area
=
0
image_area
=
0
if
'/Resources'
in
page
:
resources
=
page
[
'/Resources'
]
# 遍历页面元素
if
'/XObject'
in
resources
:
for
element
in
layout
:
x_objects
=
resources
[
'/XObject'
]
# 检查是否为图像或图形元素
# 计算所有图像对象占据的面积
if
isinstance
(
element
,
(
LTImage
,
LTFigure
)):
for
obj_name
in
x_objects
:
# 计算图像边界框面积
try
:
img_width
=
element
.
width
obj
=
x_objects
[
obj_name
]
img_height
=
element
.
height
if
obj
[
'/Subtype'
]
==
'/Image'
:
img_area
=
img_width
*
img_height
# 获取图像宽高
width
=
obj
.
get
(
'/Width'
,
0
)
height
=
obj
.
get
(
'/Height'
,
0
)
# 计算图像在页面上的估计面积
# 注意:这是估计值,因为没有考虑图像变换矩阵
scale_factor
=
1.0
# 估计缩放因子
img_area
=
width
*
height
*
scale_factor
image_area
+=
img_area
image_area
+=
img_area
except
Exception
as
e
:
# logger.debug(f"处理图像对象时出错: {e}")
# 计算覆盖率
continue
coverage_ratio
=
min
(
image_area
/
page_area
,
1.0
)
if
page_area
>
0
else
0
# logger.debug(f"PDF分析: 页面 {page_count + 1} 图像覆盖率: {coverage_ratio:.2f}")
# 估算图像覆盖率
estimated_coverage
=
min
(
image_area
/
page_area
,
1.0
)
if
page_area
>
0
else
0
# 判断是否为高覆盖率
# logger.debug(f"PDF分析: 页面 {i + 1} 图像覆盖率: {estimated_coverage:.2f}")
if
coverage_ratio
>=
0.8
:
# 使用80%作为高覆盖率的阈值
# 基于估计的图像覆盖率
if
estimated_coverage
>=
1
:
# 如果图像覆盖率超过80%,认为是高图像覆盖率页面
high_image_coverage_pages
+=
1
high_image_coverage_pages
+=
1
# 计算高图像覆盖页面比例
high_image_coverage_ratio
=
high_image_coverage_pages
/
pages_to_check
# logger.debug(f"PDF分析: 高图像覆盖页面比例: {high_image_coverage_ratio:.2f}")
pdf_stream
.
close
()
# 关闭字节流
page_count
+=
1
pdf_reader
.
close
()
return
high_image_coverage_ratio
# 如果没有处理任何页面,返回0
if
page_count
==
0
:
return
0.0
# 计算高图像覆盖率的页面比例
high_coverage_ratio
=
high_image_coverage_pages
/
page_count
# logger.debug(f"PDF分析: 高图像覆盖页面比例: {high_coverage_ratio:.2f}")
# 关闭资源
pdf_stream
.
close
()
return
high_coverage_ratio
def
extract_pages
(
src_pdf_bytes
:
bytes
)
->
bytes
:
def
extract_pages
(
src_pdf_bytes
:
bytes
)
->
bytes
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment