Commit 9f18ca20 authored by myhloli

feat(pdf_parse): improve OCR processing with contrast-based span filtering

- Rename empty_spans to need_ocr_spans for better clarity
- Add calculate_contrast function to measure image contrast
- Filter out low-contrast spans to improve OCR accuracy
- Update OCR processing workflow to use new filtering method
parent 5aa809ff
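
The heart of this commit is a cheap contrast gate that runs before each per-span OCR call: the span's crop is converted to grayscale, contrast is measured as standard deviation over mean, and crops whose contrast is 0.20 or lower are dropped instead of being sent to the OCR model. Below is a minimal, self-contained sketch of that gate. The calculate_contrast definition and the 0.20 threshold mirror the diff, but filter_crops_for_ocr and the plain NumPy-array inputs are illustrative stand-ins for the project's span and cut_image_to_pil_image plumbing, not code from the repository.

# Sketch of the contrast gate introduced in this commit (illustrative, not repository code).
import cv2
import numpy as np

def calculate_contrast(img) -> float:
    # Grayscale the BGR crop, then define contrast as std / (mean + eps) to avoid division by zero.
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return round(np.std(gray_img) / (np.mean(gray_img) + 1e-6), 2)

def filter_crops_for_ocr(crops, threshold=0.20):
    # Hypothetical helper: keep only crops whose contrast makes OCR worthwhile.
    return [crop for crop in crops if calculate_contrast(crop) > threshold]

if __name__ == '__main__':
    flat = np.full((32, 128, 3), 200, dtype=np.uint8)           # near-uniform crop, contrast ~0.0
    noisy = np.random.randint(0, 256, (32, 128, 3), np.uint8)   # high-variance crop, contrast well above 0.20
    print(len(filter_crops_for_ocr([flat, noisy])))             # prints 1: only the high-contrast crop survives

In the actual change the check is calculate_contrast(span_img) <= 0.20, and a failing span is removed from spans entirely, so it never reaches the OCR model.
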
@@ -6,8 +6,10 @@ import statistics
import time
from typing import List
import cv2
import fitz
import torch
import numpy as np
from loguru import logger
from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -127,17 +129,15 @@ def fill_char_in_spans(spans, all_chars):
span['chars'].append(char)
break
empty_spans = []
need_ocr_spans = []
for span in spans:
chars_to_content(span)
# Some spans have no real characters, only one or two empty placeholders; filter them by width/height and content length
if len(span['content']) * span['height'] < span['width'] * 0.5:
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
empty_spans.append(span)
need_ocr_spans.append(span)
del span['height'], span['width']
return empty_spans
return need_ocr_spans
# Use the more robust center-point coordinate check
def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
@@ -190,6 +190,28 @@ def remove_tilted_line(text_blocks):
block['lines'].remove(line)
def calculate_contrast(img) -> float:
"""
Calculate the contrast of a given BGR image.
:param img: image in BGR format, as a numpy.ndarray
:return: the contrast value of the image
"""
# Convert the BGR image to grayscale
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Compute the mean and standard deviation of the pixel values
mean_value = np.mean(gray_img)
std_dev = np.std(gray_img)
# Contrast is defined as the standard deviation divided by the mean (a small constant is added to avoid division by zero)
contrast = std_dev / (mean_value + 1e-6)
# logger.info(f"contrast: {contrast}")
return round(contrast, 2)
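# Illustrative note, not part of the original diff: with this definition a near-uniform crop
# (for example a blank gray patch) has std_dev close to 0, so its contrast rounds to 0.0 and
# falls below the 0.20 threshold applied in txt_spans_extract_v2 below, while a crisp
# black-on-white text crop typically scores several times higher.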
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
# cids are rendered as 0xfffd, and ligatures are split into individual characters
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -274,9 +296,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
span['chars'] = []
new_spans.append(span)
empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)
if len(empty_spans) > 0:
if len(need_ocr_spans) > 0:
# Initialize the OCR model
atom_model_manager = AtomModelSingleton()
@@ -287,9 +309,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
lang=lang
)
for span in empty_spans:
for span in need_ocr_spans:
# Crop the span's bbox from the page, then run OCR on the crop
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
# Calculate the span's contrast; skip OCR for spans whose contrast is 0.20 or lower
if calculate_contrast(span_img) <= 0.20:
spans.remove(span)
continue
ocr_res = ocr_model.ocr(span_img, det=False)
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0: