Commit 84fa04e2 authored by myhloli's avatar myhloli
Browse files

feat: enhance PDF image coverage analysis with improved parsing and coverage calculation

parent cfc78406
...@@ -5,8 +5,13 @@ import numpy as np ...@@ -5,8 +5,13 @@ import numpy as np
import pypdfium2 as pdfium import pypdfium2 as pdfium
from loguru import logger from loguru import logger
from pdfminer.high_level import extract_text from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams from pdfminer.pdfparser import PDFParser
from pypdf import PdfReader from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams, LTImage, LTFigure
from pdfminer.converter import PDFPageAggregator
def classify(pdf_bytes): def classify(pdf_bytes):
...@@ -41,7 +46,7 @@ def classify(pdf_bytes): ...@@ -41,7 +46,7 @@ def classify(pdf_bytes):
return 'ocr' return 'ocr'
else: else:
if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.9: if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
return 'ocr' return 'ocr'
return 'txt' return 'txt'
...@@ -77,60 +82,94 @@ def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check): ...@@ -77,60 +82,94 @@ def get_avg_cleaned_chars_per_page(pdf_doc, pages_to_check):
return avg_cleaned_chars_per_page return avg_cleaned_chars_per_page
def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check): def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
# 创建内存文件对象
pdf_stream = BytesIO(sample_pdf_bytes) pdf_stream = BytesIO(sample_pdf_bytes)
pdf_reader = PdfReader(pdf_stream)
# 创建PDF解析器
parser = PDFParser(pdf_stream)
# 创建PDF文档对象
document = PDFDocument(parser)
# 检查文档是否允许文本提取
if not document.is_extractable:
# logger.warning("PDF不允许内容提取")
return 1.0 # 默认为高覆盖率,因为无法提取内容
# 创建资源管理器和参数对象
rsrcmgr = PDFResourceManager()
laparams = LAParams(
line_overlap=0.5,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
boxes_flow=None,
detect_vertical=False,
all_texts=False,
)
# 创建聚合器
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# 创建解释器
interpreter = PDFPageInterpreter(rsrcmgr, device)
# 记录高图像覆盖率的页面数量 # 记录高图像覆盖率的页面数量
high_image_coverage_pages = 0 high_image_coverage_pages = 0
page_count = 0
# 检查前几页的图像 # 遍历页面
for i in range(pages_to_check): for page in PDFPage.create_pages(document):
page = pdf_reader.pages[i] # 控制检查的页数
if page_count >= pages_to_check:
break
# 处理页面
interpreter.process_page(page)
layout = device.get_result()
# 获取页面尺寸 # 页面尺寸
page_width = float(page.mediabox.width) page_width = layout.width
page_height = float(page.mediabox.height) page_height = layout.height
page_area = page_width * page_height page_area = page_width * page_height
# 算图像覆盖 # 算图像覆盖的总面积
image_area = 0 image_area = 0
if '/Resources' in page:
resources = page['/Resources'] # 遍历页面元素
if '/XObject' in resources: for element in layout:
x_objects = resources['/XObject'] # 检查是否为图像或图形元素
# 计算所有图像对象占据的面积 if isinstance(element, (LTImage, LTFigure)):
for obj_name in x_objects: # 计算图像边界框面积
try: img_width = element.width
obj = x_objects[obj_name] img_height = element.height
if obj['/Subtype'] == '/Image': img_area = img_width * img_height
# 获取图像宽高 image_area += img_area
width = obj.get('/Width', 0)
height = obj.get('/Height', 0) # 计算覆盖率
coverage_ratio = min(image_area / page_area, 1.0) if page_area > 0 else 0
# 计算图像在页面上的估计面积 # logger.debug(f"PDF分析: 页面 {page_count + 1} 图像覆盖率: {coverage_ratio:.2f}")
# 注意:这是估计值,因为没有考虑图像变换矩阵
scale_factor = 1.0 # 估计缩放因子 # 判断是否为高覆盖率
img_area = width * height * scale_factor if coverage_ratio >= 0.8: # 使用80%作为高覆盖率的阈值
image_area += img_area
except Exception as e:
# logger.debug(f"处理图像对象时出错: {e}")
continue
# 估算图像覆盖率
estimated_coverage = min(image_area / page_area, 1.0) if page_area > 0 else 0
# logger.debug(f"PDF分析: 页面 {i + 1} 图像覆盖率: {estimated_coverage:.2f}")
# 基于估计的图像覆盖率
if estimated_coverage >= 1:
# 如果图像覆盖率超过80%,认为是高图像覆盖率页面
high_image_coverage_pages += 1 high_image_coverage_pages += 1
# 计算高图像覆盖页面比例
high_image_coverage_ratio = high_image_coverage_pages / pages_to_check
# logger.debug(f"PDF分析: 高图像覆盖页面比例: {high_image_coverage_ratio:.2f}")
pdf_stream.close() # 关闭字节流 page_count += 1
pdf_reader.close()
return high_image_coverage_ratio # 如果没有处理任何页面,返回0
if page_count == 0:
return 0.0
# 计算高图像覆盖率的页面比例
high_coverage_ratio = high_image_coverage_pages / page_count
# logger.debug(f"PDF分析: 高图像覆盖页面比例: {high_coverage_ratio:.2f}")
# 关闭资源
pdf_stream.close()
return high_coverage_ratio
def extract_pages(src_pdf_bytes: bytes) -> bytes: def extract_pages(src_pdf_bytes: bytes) -> bytes:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment