"vscode:/vscode.git/clone" did not exist on "58bfcc9ca4b5cb5334f10d63a28638bd5a714b12"
pdf_check.py 3.32 KB
Newer Older
1
2
3
import fitz
import numpy as np
from loguru import logger
4
5
6
import re
from io import BytesIO
from pdfminer.high_level import extract_text
7
from pdfminer.layout import LAParams
8
9


10
def calculate_sample_count(total_page: int, *, max_sample_count: int = 10) -> int:
    """Return how many pages should be sampled from a document.

    The result is the total page count capped at ``max_sample_count`` and
    clamped so that it is never negative, which keeps it usable as a sample
    size.

    Args:
        total_page: Number of pages in the document.
        max_sample_count: Upper bound on the number of sampled pages.
            Defaults to 10, the cap that was previously hard-coded.

    Returns:
        ``min(total_page, max_sample_count)``, but at least 0.
    """
    return max(0, min(max_sample_count, total_page))


18
def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
    """Build a new document from a random sample of pages of a PDF.

    The PDF is opened from memory, up to ``calculate_sample_count`` distinct
    page indices are drawn at random (without replacement), and each chosen
    page is copied into a fresh ``fitz.Document``.

    Args:
        src_pdf_bytes: Raw bytes of the source PDF.

    Returns:
        A new ``fitz.Document`` holding the sampled pages. It is empty when
        the source has no pages, and may hold fewer pages than requested if
        copying failed part-way (the failure is logged, not raised).
    """
    pdf_docs = fitz.open("pdf", src_pdf_bytes)
    try:
        total_page = len(pdf_docs)
        if total_page == 0:
            # A PDF without pages has nothing to sample: return an empty document.
            logger.warning("PDF is empty, return empty document")
            return fitz.Document()

        select_page_cnt = calculate_sample_count(total_page)
        # Distinct page indices, in random order.
        page_num = np.random.choice(total_page, select_page_cnt, replace=False)

        sample_docs = fitz.Document()
        try:
            for index in page_num:
                # numpy integers are converted to plain ints for PyMuPDF.
                sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
        except Exception as e:
            # Best effort: keep whatever pages were copied before the failure.
            logger.exception(e)
        return sample_docs
    finally:
        # The source is no longer needed once its pages have been copied;
        # close it on every path so it is not leaked.
        pdf_docs.close()


37
38
39
40
41
42
43
44
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Check whether the text pdfminer extracts from a PDF looks garbled.

    A random sample of pages is extracted, its text is read with pdfminer,
    and the share of ``(cid:N)`` placeholders is measured. Each placeholder
    is counted as one character when computing the ratio.

    Args:
        src_pdf_bytes: Raw bytes of the PDF to inspect.

    Returns:
        ``True`` when the document is considered normal, ``False`` when more
        than 5% of the sampled text consists of ``(cid:N)`` placeholders.
        Note the polarity: despite the function name, ``True`` means "OK".
    """
    # pdfminer is slow, so only a random sample of about 10 pages is analysed.
    sample_docs = extract_pages(src_pdf_bytes)
    try:
        if len(sample_docs) == 0:
            # No pages were sampled, so there is no text that could be
            # garbled. Treat this like the empty-text case below instead of
            # trying to serialise a zero-page document.
            # NOTE(review): PyMuPDF is expected to reject saving a document
            # with zero pages -- confirm against the installed version.
            return True
        sample_pdf_bytes = sample_docs.tobytes()
    finally:
        sample_docs.close()

    laparams = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=None,
        detect_vertical=False,
        all_texts=False,
    )
    text = extract_text(pdf_file=BytesIO(sample_pdf_bytes), laparams=laparams)
    text = text.replace("\n", "")

    # Garbled text shows up in pdfminer output as "(cid:xxx)" placeholders.
    matches = re.findall(r'\(cid:\d+\)', text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_ratio = 0
    else:
        # Denominator: each placeholder counts as one character, plus all
        # remaining (non-placeholder) characters.
        cid_chars_ratio = cid_count / (cid_count + text_len - cid_len)
    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_ratio}")

    # A document with more than 5% garbled text is treated as garbled.
    return not cid_chars_ratio > 0.05
73
74
75
76
77


def count_replacement_characters(text: str) -> int:
    """Count the U+FFFD (replacement character) occurrences in a string.

    Args:
        text: The string to scan.

    Returns:
        The number of ``'\\ufffd'`` characters contained in ``text``.
    """
    return text.count('\ufffd')


def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
    """Check whether the text PyMuPDF extracts from a PDF looks garbled.

    A random sample of pages is extracted, their plain text is concatenated,
    and the share of U+FFFD replacement characters is measured.

    Args:
        src_pdf_bytes: Raw bytes of the PDF to inspect.

    Returns:
        ``True`` when the document is considered normal, ``False`` when more
        than 1% of the sampled characters are U+FFFD. Note the polarity:
        despite the function name, ``True`` means "OK".
    """
    sample_docs = extract_pages(src_pdf_bytes)
    text_flags = fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP
    try:
        # Join once instead of growing a string page by page.
        doc_text = "".join(
            page.get_text('text', flags=text_flags) for page in sample_docs
        )
    finally:
        # The sampled document is only needed for text extraction.
        sample_docs.close()

    text_len = len(doc_text)
    uffd_count = count_replacement_characters(doc_text)
    if text_len == 0:
        uffd_chars_ratio = 0
    else:
        uffd_chars_ratio = uffd_count / text_len
    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_ratio}")

    # A document with more than 1% garbled text is treated as garbled.
    return not uffd_chars_ratio > 0.01