# pdf_check.py
1
2
3
import fitz
import numpy as np
from loguru import logger
4
5
6
# import re
# from io import BytesIO
# from pdfminer.high_level import extract_text
7
8


9
def calculate_sample_count(total_page: int, max_sample: int = 10) -> int:
    """Return how many pages should be sampled from a document.

    The result is the total page count capped at ``max_sample`` and is never
    negative, so it can be used directly as a sample size.

    Args:
        total_page: Number of pages in the document.
        max_sample: Upper bound on the number of pages to sample.

    Returns:
        ``min(max_sample, total_page)``, clamped to be at least 0.
    """
    # Clamp at zero so a bogus negative page count cannot turn into a
    # negative sample size downstream.
    return max(0, min(max_sample, total_page))


17
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
    """Build a new document from a random sample of pages of a PDF.

    Pages are chosen without replacement; the number of pages comes from
    ``calculate_sample_count``. The source document opened here is always
    closed before returning.

    Args:
        src_pdf_bytes: Raw bytes of the source PDF.

    Returns:
        A new ``fitz.Document`` holding the sampled pages, in the random
        order in which they were drawn. It is empty when the source has no
        pages, and may be incomplete if copying a page fails, because copy
        errors are logged instead of raised. The caller owns the returned
        document.
    """
    pdf_docs = fitz.open("pdf", src_pdf_bytes)
    try:
        total_page = len(pdf_docs)
        if total_page == 0:
            # Nothing to sample from: hand back an empty document.
            logger.warning("PDF is empty, return empty document")
            return fitz.Document()

        select_page_cnt = calculate_sample_count(total_page)
        page_num = np.random.choice(total_page, select_page_cnt, replace=False)

        sample_docs = fitz.Document()
        try:
            for index in page_num:
                # Convert numpy integers to plain ints before passing them on.
                page_index = int(index)
                sample_docs.insert_pdf(
                    pdf_docs, from_page=page_index, to_page=page_index
                )
        except Exception as e:
            # Best effort: keep whatever pages were copied so far.
            logger.exception(e)
        return sample_docs
    finally:
        # Release the source document on every exit path.
        pdf_docs.close()


36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
#     """
#     检测PDF中是否包含非法字符
#     """
#     '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
#     sample_docs = extract_pages(src_pdf_bytes)
#     sample_pdf_bytes = sample_docs.tobytes()
#     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
#     text = extract_text(sample_pdf_file_like_object)
#     text = text.replace("\n", "")
#     # logger.info(text)
#     '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
#     cid_pattern = re.compile(r'\(cid:\d+\)')
#     matches = cid_pattern.findall(text)
#     cid_count = len(matches)
#     cid_len = sum(len(match) for match in matches)
#     text_len = len(text)
#     if text_len == 0:
#         cid_chars_radio = 0
#     else:
#         cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
#     logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
#     '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
#     if cid_chars_radio > 0.05:
#         return False  # 乱码文档
#     else:
#         return True   # 正常文档


def count_replacement_characters(text: str) -> int:
    """Count the U+FFFD (replacement character) occurrences in ``text``.

    Args:
        text: The string to inspect.

    Returns:
        The number of ``'\\ufffd'`` characters found in ``text``.
    """
    return text.count('\ufffd')


def detect_invalid_chars_by_pymupdf(
    src_pdf_bytes: bytes, *, garbled_ratio_threshold: float = 0.01
) -> bool:
    """Check whether a PDF's extracted text is free of garbled characters.

    A random sample of pages is taken with ``extract_pages``, their text is
    extracted, and the share of U+FFFD replacement characters in that text is
    compared against ``garbled_ratio_threshold``.

    Args:
        src_pdf_bytes: Raw bytes of the PDF to inspect.
        garbled_ratio_threshold: Largest share of U+FFFD characters still
            treated as a normal document (default 0.01, i.e. 1%).

    Returns:
        ``False`` when the share of U+FFFD characters exceeds the threshold
        (garbled document), ``True`` otherwise. A sample with no extracted
        text counts as normal.
    """
    sample_docs = extract_pages(src_pdf_bytes)
    try:
        text_flags = fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP
        # Join once instead of growing a string page by page.
        doc_text = "".join(
            page.get_text('text', flags=text_flags) for page in sample_docs
        )
    finally:
        # The sampled document is only needed for text extraction.
        sample_docs.close()

    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    # No text at all means nothing can be garbled; this also avoids dividing by zero.
    uffd_chars_ratio = uffd_count / text_len if text_len else 0
    # The log label keeps its original spelling so existing log parsing still matches.
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_ratio}")
    # More than the threshold share of replacement characters marks the
    # document as garbled (False); otherwise it is normal (True).
    return not uffd_chars_ratio > garbled_ratio_threshold