"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "3f6ff20b9b01478edeadedada1d1db0375111616"
Commit ac888156 authored by myhloli's avatar myhloli
Browse files

refactor(pdf_check): improve character detection using PyMuPDF

- Replace pdfminer with PyMuPDF for character detection
- Implement new method detect_invalid_chars_by_pymupdf
- Update check_invalid_chars in pdf_meta_scan.py to use new method
- Add __replace_0xfffd function in pdf_parse_union_core_v2.py to handle special characters
- Remove unused imports and update requirements.txt
parent 88c0854a
...@@ -8,7 +8,7 @@ from loguru import logger ...@@ -8,7 +8,7 @@ from loguru import logger
from magic_pdf.config.drop_reason import DropReason from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import get_top_percent_list, mymax from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
scan_max_page = 50 scan_max_page = 50
junk_limit_min = 10 junk_limit_min = 10
...@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document): ...@@ -323,7 +323,7 @@ def get_language(doc: fitz.Document):
def check_invalid_chars(pdf_bytes):
    """Garbled-text (乱码) check.

    Thin wrapper that forwards the raw PDF bytes to the PyMuPDF-based
    invalid-character detector and returns its verdict unchanged.
    """
    is_normal_doc = detect_invalid_chars_by_pymupdf(pdf_bytes)
    return is_normal_doc
def pdf_meta_scan(pdf_bytes: bytes): def pdf_meta_scan(pdf_bytes: bytes):
......
from io import BytesIO
import re
import fitz import fitz
import numpy as np import numpy as np
from loguru import logger from loguru import logger
from pdfminer.high_level import extract_text # import re
# from io import BytesIO
# from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int): def calculate_sample_count(total_page: int):
...@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int): ...@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
return select_page_cnt return select_page_cnt
def extract_pages(src_pdf_bytes: bytes): def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
pdf_docs = fitz.open("pdf", src_pdf_bytes) pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs) total_page = len(pdf_docs)
if total_page == 0: if total_page == 0:
...@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes): ...@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
return sample_docs return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: # def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
"""" # """"
检测PDF中是否包含非法字符 # 检测PDF中是否包含非法字符
# """
# '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
# sample_docs = extract_pages(src_pdf_bytes)
# sample_pdf_bytes = sample_docs.tobytes()
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
# text = extract_text(sample_pdf_file_like_object)
# text = text.replace("\n", "")
# # logger.info(text)
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
# cid_pattern = re.compile(r'\(cid:\d+\)')
# matches = cid_pattern.findall(text)
# cid_count = len(matches)
# cid_len = sum(len(match) for match in matches)
# text_len = len(text)
# if text_len == 0:
# cid_chars_radio = 0
# else:
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
# if cid_chars_radio > 0.05:
# return False # 乱码文档
# else:
# return True # 正常文档
def count_replacement_characters(text: str) -> int:
    """Return how many U+FFFD replacement characters appear in *text*."""
    return sum(1 for ch in text if ch == '\ufffd')
def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
    """Detect garbled (乱码) text in a PDF using PyMuPDF.

    A sample of pages is extracted and their plain text is scanned for the
    U+FFFD replacement character, which PyMuPDF emits for glyphs it cannot
    map to Unicode.

    Args:
        src_pdf_bytes: Raw bytes of the source PDF.

    Returns:
        True if the document looks normal; False if more than 1% of the
        extracted characters are replacement characters (garbled document).
    """
    sample_docs = extract_pages(src_pdf_bytes)
    try:
        # Join page texts once instead of growing a string with += in a loop.
        doc_text = ''.join(
            page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
            for page in sample_docs
        )
    finally:
        # Fix: the sampled fitz.Document was never closed (resource leak).
        sample_docs.close()
    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    # Guard against ZeroDivisionError when no text could be extracted.
    uffd_chars_ratio = 0 if text_len == 0 else uffd_count / text_len
    # Log label fixed from the original misspelling "radio" -> "ratio".
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_ratio: {uffd_chars_ratio}")
    '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
    if uffd_chars_ratio > 0.01:
        return False  # garbled document (乱码文档)
    else:
        return True  # normal document (正常文档)
\ No newline at end of file
...@@ -57,6 +57,13 @@ def __replace_STX_ETX(text_str: str): ...@@ -57,6 +57,13 @@ def __replace_STX_ETX(text_str: str):
return text_str return text_str
def __replace_0xfffd(text_str: str):
    """Blank out U+FFFD replacement characters.

    PyMuPDF substitutes U+FFFD for glyphs it cannot decode; swapping each
    one for a space keeps the assembled span content readable.
    """
    if not text_str:
        return text_str
    return text_str.replace('\ufffd', " ")
def chars_to_content(span): def chars_to_content(span):
# 检查span中的char是否为空 # 检查span中的char是否为空
if len(span['chars']) == 0: if len(span['chars']) == 0:
...@@ -76,7 +83,8 @@ def chars_to_content(span): ...@@ -76,7 +83,8 @@ def chars_to_content(span):
if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width: if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
content += ' ' content += ' '
content += char['c'] content += char['c']
span['content'] = __replace_STX_ETX(content)
span['content'] = __replace_0xfffd(content)
del span['chars'] del span['chars']
...@@ -140,7 +148,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag): ...@@ -140,7 +148,7 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE)['blocks'] text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
all_pymu_chars = [] all_pymu_chars = []
for block in text_blocks_raw: for block in text_blocks_raw:
......
...@@ -4,10 +4,10 @@ click>=8.1.7 ...@@ -4,10 +4,10 @@ click>=8.1.7
fast-langdetect==0.2.0 fast-langdetect==0.2.0
loguru>=0.6.0 loguru>=0.6.0
numpy>=1.21.6,<2.0.0 numpy>=1.21.6,<2.0.0
pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0 pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9 PyMuPDF>=1.24.9
scikit-learn>=1.0.2 scikit-learn>=1.0.2
torch>=2.2.2,<=2.3.1 torch>=2.2.2,<=2.3.1
transformers transformers
# pdfminer.six==20231228
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator. # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment