Commit e1be7da6 authored by myhloli's avatar myhloli
Browse files

refactor(magic_pdf): switch to pdfminer for invalid character detection

- Replace MuPDF with pdfminer for detecting invalid characters in PDFs
- Uncomment and update the detect_invalid_chars function to use pdfminer
- Update the check_invalid_chars function in pdf_meta_scan.py to use the new implementation
parent 01cd633d
...@@ -8,7 +8,7 @@ from loguru import logger ...@@ -8,7 +8,7 @@ from loguru import logger
from magic_pdf.config.drop_reason import DropReason from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import get_top_percent_list, mymax from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
scan_max_page = 50 scan_max_page = 50
junk_limit_min = 10 junk_limit_min = 10
...@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document): ...@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
def check_invalid_chars(pdf_bytes): def check_invalid_chars(pdf_bytes):
"""乱码检测.""" """乱码检测."""
return detect_invalid_chars_by_pymupdf(pdf_bytes) # return detect_invalid_chars_by_pymupdf(pdf_bytes)
return detect_invalid_chars(pdf_bytes)
def pdf_meta_scan(pdf_bytes: bytes): def pdf_meta_scan(pdf_bytes: bytes):
......
import fitz import fitz
import numpy as np import numpy as np
from loguru import logger from loguru import logger
# import re import re
# from io import BytesIO from io import BytesIO
# from pdfminer.high_level import extract_text from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int): def calculate_sample_count(total_page: int):
...@@ -33,33 +33,33 @@ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document: ...@@ -33,33 +33,33 @@ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
return sample_docs return sample_docs
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool: def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
# """" """"
# 检测PDF中是否包含非法字符 检测PDF中是否包含非法字符
# """ """
# '''pdfminer比较慢,需要先随机抽取10页左右的sample''' '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
# sample_docs = extract_pages(src_pdf_bytes) sample_docs = extract_pages(src_pdf_bytes)
# sample_pdf_bytes = sample_docs.tobytes() sample_pdf_bytes = sample_docs.tobytes()
# sample_pdf_file_like_object = BytesIO(sample_pdf_bytes) sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
# text = extract_text(sample_pdf_file_like_object) text = extract_text(sample_pdf_file_like_object)
# text = text.replace("\n", "") text = text.replace("\n", "")
# # logger.info(text) # logger.info(text)
# '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)''' '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
# cid_pattern = re.compile(r'\(cid:\d+\)') cid_pattern = re.compile(r'\(cid:\d+\)')
# matches = cid_pattern.findall(text) matches = cid_pattern.findall(text)
# cid_count = len(matches) cid_count = len(matches)
# cid_len = sum(len(match) for match in matches) cid_len = sum(len(match) for match in matches)
# text_len = len(text) text_len = len(text)
# if text_len == 0: if text_len == 0:
# cid_chars_radio = 0 cid_chars_radio = 0
# else: else:
# cid_chars_radio = cid_count/(cid_count + text_len - cid_len) cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
# logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}") logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
# '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档''' '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
# if cid_chars_radio > 0.05: if cid_chars_radio > 0.05:
# return False # 乱码文档 return False # 乱码文档
# else: else:
# return True # 正常文档 return True # 正常文档
def count_replacement_characters(text: str) -> int: def count_replacement_characters(text: str) -> int:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment