test_meta_scan.py 5.38 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
import os

import pytest
赵小蒙's avatar
赵小蒙 committed
4
from pdf_tools.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language
赵小蒙's avatar
赵小蒙 committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from test.test_commons import get_docs_from_test_pdf, get_test_json_data

# 获取当前目录
current_directory = os.path.dirname(os.path.abspath(__file__))

'''
获取pdf的宽与高,宽和高各用一个list,分别取中位数
'''
@pytest.mark.parametrize("book_name, expected_width, expected_height",
                         [
                             ("zlib/zlib_17058115", 795, 1002),  # pdf中最大页与最小页差异极大个例
                             ("the_eye/the_eye_wtl_00023799", 616, 785)  # 采样的前50页存在中位数大小页面横竖旋转情况
                         ])
def test_get_pdf_page_size_pts(book_name, expected_width, expected_height):
    docs = get_docs_from_test_pdf(book_name)
    median_width, median_height = get_pdf_page_size_pts(docs)

    assert int(median_width) == expected_width
    assert int(median_height) == expected_height


'''
获取pdf前50页的图片信息,为了提速,对特殊扫描版1的情况做了过滤,其余情况都正常取图片信息
'''
@pytest.mark.parametrize("book_name",
                         [
                            "zlib/zlib_21370453",  # 特殊扫描版1,每页都有所有扫描页图片,特点是图占比大,每页展示1至n张
                            "the_eye/the_eye_cdn_00391653",  # 特殊文字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张,这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist
                            "scihub/scihub_08400000/libgen.scimag08489000-08489999.zip_10.1016/0370-1573(90)90070-i",  # 扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断
                            "zlib/zlib_17216416",  # 特殊扫描版3,有的页面是一整张大图,有的页面是通过一条条小图拼起来的
                            "the_eye/the_eye_wtl_00023799",  # 特殊扫描版4,每一页都是一张张小图拼出来的
                            "the_eye/the_eye_cdn_00328381",  # 特殊扫描版5,每一页都是一张张小图拼出来的,但是存在多个小图多次重复使用情况
                            "scihub/scihub_25800000/libgen.scimag25889000-25889999.zip_10.2307/4153991",  # 特殊扫描版6,只有3页且其中两页是扫描页
                            "scanned_detection/llm-raw-scihub-o.O-0584-8539%2891%2980165-f",  # 特殊扫描版7,只有一页,且是一张张小图拼出来的
                            "scanned_detection/llm-raw-scihub-o.O-bf01427123",  # 特殊扫描版8,只有3页且全是大图扫描版
                            "zlib/zlib_22115997",  # 特殊扫描版9,类似特1,但是每页数量不完全相等
                            "zlib/zlib_21814957",  # 特殊扫描版10,类似特1,但是每页数量不完全相等
                            "zlib/zlib_21814955",  # 特殊扫描版11,类似特1,但是每页数量不完全相等
                            "scihub/scihub_41200000/libgen.scimag41253000-41253999.zip_10.1080/00222938709460256",  # 特殊扫描版12,头两页文字版且有一页没图片,后面扫描版11页
                            "scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622"  # 特殊扫描版13,头两页文字版且有一页没图片,后面扫描版3页
                         ])
def test_get_image_info(book_name):
    test_data = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    docs = get_docs_from_test_pdf(book_name)
    page_width_pts, page_height_pts = get_pdf_page_size_pts(docs)
    image_info, junk_img_bojids = get_image_info(docs, page_width_pts, page_height_pts)

    assert image_info == test_data[book_name]["expected_image_info"]
    assert junk_img_bojids == test_data[book_name]["expected_junk_img_bojids"]


'''
获取pdf前50页的文本布局信息,输出list,每个元素为一个页面的横竖排信息
'''
@pytest.mark.parametrize("book_name",
                         [
                            "vertical_detection/三国演义_繁体竖排版",  # 竖排版本1
                            "vertical_detection/净空法师_大乘无量寿",  # 竖排版本2
                            "vertical_detection/om3006239",  # 横排版本1
                            "vertical_detection/isit.2006.261791"  # 横排版本2
                         ])
def test_get_text_layout_info(book_name):
    test_data = get_test_json_data(current_directory, "test_metascan_classify_data.json")

    docs = get_docs_from_test_pdf(book_name)
    text_layout_info = get_pdf_text_layout_per_page(docs)
    assert text_layout_info == test_data[book_name]["expected_text_layout"]


'''
获取pdf的语言信息
'''
@pytest.mark.parametrize("book_name, expected_language",
                         [
                             ("scihub/scihub_05000000/libgen.scimag05023000-05023999.zip_10.1034/j.1601-0825.2003.02933.x", "en"),  # 英文论文
                         ])
def test_get_text_language_info(book_name, expected_language):
    docs = get_docs_from_test_pdf(book_name)
    text_language = get_language(docs)
    assert text_language == expected_language