Unverified Commit 7b197fe2 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #998 from myhloli/dev

test(unitest): Restore unit test cases
parents 8e981b3a 90cf1082
import os
import pytest
from magic_pdf.filter.pdf_classify_by_type import classify_by_area, classify_by_text_len, classify_by_avg_words, \
classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
from tests.test_commons import get_docs_from_test_pdf, get_test_json_data
# Directory containing this test module (passed to get_test_json_data to locate the JSON fixture)
current_directory = os.path.dirname(os.path.abspath(__file__))
@pytest.mark.parametrize(
    "book_name, expected_bool_classify_by_area",
    [
        # Special text PDF 1: every page stores all images; images cover only a
        # small share of the page, and a page may display zero or several of them.
        ("the_eye/the_eye_cdn_00391653", True),
        # Special scanned PDF 2: the number of scan images stored per page keeps
        # growing; images cover a large share of the page, one displayed per page.
        ("scihub/scihub_08400000/libgen.scimag08489000-08489999.zip_10.1016/0370-1573(90)90070-i", False),
        # Special scanned PDF 3: some pages are one large image, others are stitched
        # from strips of small images; the strips must be merged by rule into a
        # large image before the image ratio is checked.
        ("zlib/zlib_17216416", False),
        # Special scanned PDF 4: every page is stitched from small images, which
        # must be merged by rule before the image ratio is checked.
        ("the_eye/the_eye_wtl_00023799", False),
        # Special scanned PDF 5: every page is stitched from small images and
        # several small images are reused multiple times; merge before checking.
        ("the_eye/the_eye_cdn_00328381", False),
        # Special scanned PDF 6: only three pages, two of which are scanned.
        ("scihub/scihub_25800000/libgen.scimag25889000-25889999.zip_10.2307/4153991", False),
        # Special scanned PDF 7: a single page stitched from small images.
        ("scanned_detection/llm-raw-scihub-o.O-0584-8539%2891%2980165-f", False),
        # Special scanned PDF 8: only 3 pages, all of them large-image scans.
        ("scanned_detection/llm-raw-scihub-o.O-bf01427123", False),
        # Special scanned PDF 12: the first two pages are text (one has no image),
        # followed by 11 scanned pages.
        ("scihub/scihub_41200000/libgen.scimag41253000-41253999.zip_10.1080/00222938709460256", False),
        # Special scanned PDF 13: the first two pages are text (one has no image),
        # followed by 3 scanned pages.
        ("scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622", False),
    ],
)
def test_classify_by_area(book_name, expected_bool_classify_by_area):
    """Check classify_by_area, which judges whether a PDF is a scanned version
    from the ratio of image size to page area.

    Page size and per-page text lengths are read from the sample PDF; the image
    info comes from the ``expected_image_info`` entry of the JSON fixture.
    """
    fixtures = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    pdf_docs = get_docs_from_test_pdf(book_name)

    width_pts, height_pts = get_pdf_page_size_pts(pdf_docs)
    width, height = int(width_pts), int(height_pts)

    image_sizes = fixtures[book_name]["expected_image_info"]
    page_count = len(pdf_docs)
    text_lengths = get_pdf_textlen_per_page(pdf_docs)

    verdict = classify_by_area(page_count, width, height, image_sizes, text_lengths)
    assert verdict == expected_bool_classify_by_area
@pytest.mark.parametrize(
    "book_name, expected_bool_classify_by_text_len",
    [
        # Text PDF with fewer than 50 pages.
        ("scihub/scihub_67200000/libgen.scimag67237000-67237999.zip_10.1515/crpm-2017-0020", True),
        # Text PDF with more than 50 pages.
        ("scihub/scihub_83300000/libgen.scimag83306000-83306999.zip_10.1007/978-3-658-30153-8", True),
        # Brochure containing no text at all.
        ("zhongwenzaixian/zhongwenzaixian_65771414", False),
    ],
)
def test_classify_by_text_len(book_name, expected_bool_classify_by_text_len):
    """Check classify_by_text_len, the broad text-PDF detection.

    Original author's note on the rule: a PDF is treated as a text version as
    soon as any page has more than 100 characters.
    """
    pdf_docs = get_docs_from_test_pdf(book_name)
    text_lengths = get_pdf_textlen_per_page(pdf_docs)
    page_count = len(pdf_docs)

    verdict = classify_by_text_len(text_lengths, page_count)
    assert verdict == expected_bool_classify_by_text_len
@pytest.mark.parametrize(
    "book_name, expected_bool_classify_by_avg_words",
    [
        # Scanned PDF; the last few pages of the book contain outline text.
        ("zlib/zlib_21207669", False),
        # Scanned PDF; a collection of several scanned books, each ending with one text page.
        ("zlib/zlib_19012845", False),
        # Regular text PDF.
        ("scihub/scihub_67200000/libgen.scimag67237000-67237999.zip_10.1515/crpm-2017-0020", True),
        # Brochure.
        ("zhongwenzaixian/zhongwenzaixian_65771414", False),
        # Illustrated book with no or little text.
        ("zhongwenzaixian/zhongwenzaixian_351879", False),
        # Calligraphy collection.
        ("zhongwenzaixian/zhongwenzaixian_61357496_pdfvector", False),
        # Design drawings.
        ("zhongwenzaixian/zhongwenzaixian_63684541", False),
        # Picture book.
        ("zhongwenzaixian/zhongwenzaixian_61525978", False),
        # Photography collection.
        ("zhongwenzaixian/zhongwenzaixian_63679729", False),
    ],
)
def test_classify_by_avg_words(book_name, expected_bool_classify_by_avg_words):
    """Check classify_by_avg_words, the narrow text-PDF detection.

    Original author's note on the rule: the average number of characters per
    page must exceed 200.
    """
    text_lengths = get_pdf_textlen_per_page(get_docs_from_test_pdf(book_name))
    verdict = classify_by_avg_words(text_lengths)
    assert verdict == expected_bool_classify_by_avg_words
@pytest.mark.parametrize(
    "book_name, expected_bool_classify_by_img_num",
    [
        # Special scanned PDF 1: every page holds all scan images; images cover a
        # large share of the page, with 1 to n of them displayed per page.
        ("zlib/zlib_21370453", False),
        # Special scanned PDF 2: like special 1, but the per-page counts are not all equal.
        ("zlib/zlib_22115997", False),
        # Special scanned PDF 3: like special 1, but the per-page counts are not all equal.
        ("zlib/zlib_21814957", False),
        # Special scanned PDF 4: like special 1, but the per-page counts are not all equal.
        ("zlib/zlib_21814955", False),
    ],
)
def test_classify_by_img_num(book_name, expected_bool_classify_by_img_num):
    """Check classify_by_img_num.

    Original author's note: this rule only targets "special scanned PDF 1",
    whose image info is discarded because of the junk_list, so it can only be
    judged by the number of images.
    """
    fixtures = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    pdf_docs = get_docs_from_test_pdf(book_name)

    images_per_page = get_imgs_per_page(pdf_docs)
    image_sizes = fixtures[book_name]["expected_image_info"]

    verdict = classify_by_img_num(image_sizes, images_per_page)
    assert verdict == expected_bool_classify_by_img_num
@pytest.mark.parametrize(
    "book_name, expected_bool_classify_by_text_layout",
    [
        # Vertical layout, sample 1.
        ("vertical_detection/三国演义_繁体竖排版", False),
        # Vertical layout, sample 2.
        ("vertical_detection/净空法师_大乘无量寿", False),
        # Horizontal layout, sample 1.
        ("vertical_detection/om3006239", True),
        # Horizontal layout, sample 2.
        ("vertical_detection/isit.2006.261791", True),
    ],
)
def test_classify_by_text_layout(book_name, expected_bool_classify_by_text_layout):
    """Check classify_by_text_layout, which excludes vertically typeset PDFs.

    The per-page layout input is taken from the ``expected_text_layout`` entry
    of the JSON fixture rather than recomputed from the PDF.
    """
    fixtures = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    layouts = fixtures[book_name]["expected_text_layout"]
    assert classify_by_text_layout(layouts) == expected_bool_classify_by_text_layout
@pytest.mark.parametrize(
    "book_name, expected_bool_classify_by_img_narrow_strips",
    [
        # Special scanned PDF.
        ("scihub/scihub_25900000/libgen.scimag25991000-25991999.zip_10.2307/40066695", False),
        # Special scanned PDF 4: every page is stitched from small images, which
        # must be merged by rule before the image ratio is checked.
        ("the_eye/the_eye_wtl_00023799", False),
        # Special scanned PDF 5: every page is stitched from small images and
        # several small images are reused multiple times; merge before checking.
        ("the_eye/the_eye_cdn_00328381", False),
        # Special scanned PDF 7: a single page stitched from small images.
        ("scanned_detection/llm-raw-scihub-o.O-0584-8539%2891%2980165-f", False),
        # Special scanned PDF 6: only three pages, two of which are scanned.
        ("scihub/scihub_25800000/libgen.scimag25889000-25889999.zip_10.2307/4153991", True),
        # Special scanned PDF 8: only 3 pages, all of them large-image scans.
        ("scanned_detection/llm-raw-scihub-o.O-bf01427123", True),
        # Special text PDF: contains a long strip, but only a single one.
        ("scihub/scihub_53700000/libgen.scimag53724000-53724999.zip_10.1097/00129191-200509000-00018", True),
    ],
)
def test_classify_by_img_narrow_strips(book_name, expected_bool_classify_by_img_narrow_strips):
    """Check classify_by_img_narrow_strips.

    Original author's note: special scanned PDFs are filtered by detecting
    whether pages are composed of multiple narrow strip images. The rule only
    recognises PDFs built from narrow strips, not regular large-image scans.
    """
    fixtures = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    image_sizes = fixtures[book_name]["expected_image_info"]

    pdf_docs = get_docs_from_test_pdf(book_name)
    width_pts, height_pts = get_pdf_page_size_pts(pdf_docs)
    width, height = int(width_pts), int(height_pts)

    verdict = classify_by_img_narrow_strips(width, height, image_sizes)
    assert verdict == expected_bool_classify_by_img_narrow_strips
\ No newline at end of file
import os
import pytest
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language
from tests.test_commons import get_docs_from_test_pdf, get_test_json_data
# Directory containing this test module (passed to get_test_json_data to locate the JSON fixture)
current_directory = os.path.dirname(os.path.abspath(__file__))
@pytest.mark.parametrize(
    "book_name, expected_width, expected_height",
    [
        # Case where the largest and smallest pages of the PDF differ enormously.
        ("zlib/zlib_17058115", 795, 1002),
        # Among the sampled first 50 pages, median-sized pages appear rotated
        # between portrait and landscape.
        ("the_eye/the_eye_wtl_00023799", 616, 785),
    ],
)
def test_get_pdf_page_size_pts(book_name, expected_width, expected_height):
    """Check the page width and height returned by get_pdf_page_size_pts.

    Original author's note: widths and heights are each collected in a list and
    the median of each is taken. Results are truncated with ``int`` before
    comparison.
    """
    pdf_docs = get_docs_from_test_pdf(book_name)
    width_pts, height_pts = get_pdf_page_size_pts(pdf_docs)

    assert int(width_pts) == expected_width
    assert int(height_pts) == expected_height
@pytest.mark.parametrize(
    "book_name",
    [
        # Special scanned PDF 1: every page holds all scan images; images cover a
        # large share of the page, with 1 to n of them displayed per page.
        "zlib/zlib_21370453",
        # Special text PDF 1: every page stores all images; images cover only a
        # small share of the page and a page may display zero or several. Such
        # PDFs need the first 10 pages sampled for image size and count; if they
        # match, the junk list must be cleared.
        "the_eye/the_eye_cdn_00391653",
        # Scanned PDF 2: the number of scan images stored per page keeps growing;
        # images cover a large share, one displayed per page. The junk list must
        # be cleared and image info of the first 50 pages used for classification.
        "scihub/scihub_08400000/libgen.scimag08489000-08489999.zip_10.1016/0370-1573(90)90070-i",
        # Special scanned PDF 3: some pages are one large image, others are
        # stitched from strips of small images.
        "zlib/zlib_17216416",
        # Special scanned PDF 4: every page is stitched from small images.
        "the_eye/the_eye_wtl_00023799",
        # Special scanned PDF 5: every page is stitched from small images, and
        # several small images are reused multiple times.
        "the_eye/the_eye_cdn_00328381",
        # Special scanned PDF 6: only 3 pages, two of which are scanned pages.
        "scihub/scihub_25800000/libgen.scimag25889000-25889999.zip_10.2307/4153991",
        # Special scanned PDF 7: a single page stitched from small images.
        "scanned_detection/llm-raw-scihub-o.O-0584-8539%2891%2980165-f",
        # Special scanned PDF 8: only 3 pages, all of them large-image scans.
        "scanned_detection/llm-raw-scihub-o.O-bf01427123",
        # Special scanned PDF 9: like special 1, but the per-page counts are not all equal.
        "zlib/zlib_22115997",
        # Special scanned PDF 10: like special 1, but the per-page counts are not all equal.
        "zlib/zlib_21814957",
        # Special scanned PDF 11: like special 1, but the per-page counts are not all equal.
        "zlib/zlib_21814955",
        # Special scanned PDF 12: the first two pages are text (one has no image),
        # followed by 11 scanned pages.
        "scihub/scihub_41200000/libgen.scimag41253000-41253999.zip_10.1080/00222938709460256",
        # Special scanned PDF 13: the first two pages are text (one has no image),
        # followed by 3 scanned pages.
        "scihub/scihub_37000000/libgen.scimag37068000-37068999.zip_10.1080/0015587X.1936.9718622",
    ],
)
def test_get_image_info(book_name):
    """Compare get_image_info output with the values recorded in the JSON fixture.

    Original author's note: image info is collected for the first 50 pages of
    the PDF; for speed, the "special scanned PDF 1" case is filtered, while all
    other cases collect image info normally.
    """
    fixtures = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    pdf_docs = get_docs_from_test_pdf(book_name)

    width_pts, height_pts = get_pdf_page_size_pts(pdf_docs)
    image_info, junk_img_bojids = get_image_info(pdf_docs, width_pts, height_pts)

    expected = fixtures[book_name]
    assert image_info == expected["expected_image_info"]
    assert junk_img_bojids == expected["expected_junk_img_bojids"]
@pytest.mark.parametrize(
    "book_name",
    [
        # Vertical layout, sample 1.
        "vertical_detection/三国演义_繁体竖排版",
        # Vertical layout, sample 2.
        "vertical_detection/净空法师_大乘无量寿",
        # Horizontal layout, sample 1.
        "vertical_detection/om3006239",
        # Horizontal layout, sample 2.
        "vertical_detection/isit.2006.261791",
    ],
)
def test_get_text_layout_info(book_name):
    """Compare get_pdf_text_layout_per_page output with the JSON fixture.

    Original author's note: text layout info is collected for the first 50
    pages; the output is a list in which each element is the horizontal or
    vertical layout of one page.
    """
    fixtures = get_test_json_data(current_directory, "test_metascan_classify_data.json")
    pdf_docs = get_docs_from_test_pdf(book_name)

    layouts = get_pdf_text_layout_per_page(pdf_docs)
    assert layouts == fixtures[book_name]["expected_text_layout"]
@pytest.mark.parametrize(
    "book_name, expected_language",
    [
        # English paper.
        ("scihub/scihub_05000000/libgen.scimag05023000-05023999.zip_10.1034/j.1601-0825.2003.02933.x", "en"),
    ],
)
def test_get_text_language_info(book_name, expected_language):
    """Check the language reported by get_language for a sample PDF."""
    pdf_docs = get_docs_from_test_pdf(book_name)
    detected = get_language(pdf_docs)
    assert detected == expected_language
This diff is collapsed.
This diff is collapsed.
import json
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.model.magic_model import MagicModel
def test_magic_model_image_v2():
    """Smoke test for MagicModel.get_imgs_v2 / get_tables_v2 on test_01.

    Builds a MagicModel from the stored model JSON and the first dataset read
    from the PDF, then prints the results of both calls. No assertions are made
    on the returned values, so the test only fails if a call raises.
    """
    datasets = read_local_pdfs('tests/test_model/assets/test_01.pdf')
    # Explicit UTF-8 so decoding the JSON fixture does not depend on the locale.
    with open('tests/test_model/assets/test_01.model.json', encoding='utf-8') as f:
        model_json = json.load(f)

    magic_model = MagicModel(model_json, datasets[0])

    # The argument is presumably a page index - TODO confirm against MagicModel.
    imgs = magic_model.get_imgs_v2(0)
    print(imgs)

    tables = magic_model.get_tables_v2(0)
    print(tables)
def test_magic_model_table_v2():
    """Smoke test for MagicModel.get_tables_v2 on test_02.

    Builds a MagicModel from the stored model JSON and the first dataset read
    from the PDF, then prints the results for arguments 5 and 8. No assertions
    are made on the returned values, so the test only fails if a call raises.
    """
    datasets = read_local_pdfs('tests/test_model/assets/test_02.pdf')
    # Explicit UTF-8 so decoding the JSON fixture does not depend on the locale.
    with open('tests/test_model/assets/test_02.model.json', encoding='utf-8') as f:
        model_json = json.load(f)

    magic_model = MagicModel(model_json, datasets[0])

    # The arguments are presumably page indices - TODO confirm against MagicModel.
    tables = magic_model.get_tables_v2(5)
    print(tables)

    tables = magic_model.get_tables_v2(8)
    print(tables)
......@@ -7,7 +7,7 @@ from magic_pdf.model.sub_modules.table.tablemaster.tablemaster_paddle import Tab
class TestppTableModel(unittest.TestCase):
def test_image2html(self):
img = Image.open("tests/test_table/assets/table.jpg")
img = Image.open("tests/unittest/test_table/assets/table.jpg")
# 修改table模型路径
config = {"device": "cuda",
"model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"}
......
{"file_location":"tests/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722
412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.5010986328125,1346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,3
84,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\operatorname{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\cdot}00\\;\\mathrm{a.m}"},{"category_id":13,"p
oly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20~\\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(l)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"^{1\\mathrm{~h~}}"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]}
\ No newline at end of file
[
{
"layout_dets": [
{
"category_id": 1,
"poly": [
882.4013061523438,
169.93817138671875,
1552.350341796875,
169.93817138671875,
1552.350341796875,
625.8263549804688,
882.4013061523438,
625.8263549804688
],
"score": 0.999992311000824
},
{
"category_id": 1,
"poly": [
882.474853515625,
1450.92822265625,
1551.4490966796875,
1450.92822265625,
1551.4490966796875,
1877.5712890625,
882.474853515625,
1877.5712890625
],
"score": 0.9999903440475464
},
{
"category_id": 1,
"poly": [
881.6513061523438,
626.2058715820312,
1552.1400146484375,
626.2058715820312,
1552.1400146484375,
1450.604736328125,
881.6513061523438,
1450.604736328125
],
"score": 0.9999856352806091
},
{
"category_id": 1,
"poly": [
149.41075134277344,
232.1595001220703,
819.0465087890625,
232.1595001220703,
819.0465087890625,
625.8865356445312,
149.41075134277344,
625.8865356445312
],
"score": 0.99998539686203
},
{
"category_id": 1,
"poly": [
149.3945770263672,
1215.5172119140625,
817.8850708007812,
1215.5172119140625,
817.8850708007812,
1304.873291015625,
149.3945770263672,
1304.873291015625
],
"score": 0.9999765157699585
},
{
"category_id": 1,
"poly": [
882.6979370117188,
1880.13916015625,
1552.15185546875,
1880.13916015625,
1552.15185546875,
2031.339599609375,
882.6979370117188,
2031.339599609375
],
"score": 0.9999744892120361
},
{
"category_id": 1,
"poly": [
148.96054077148438,
743.3055419921875,
818.6231689453125,
743.3055419921875,
818.6231689453125,
1074.2369384765625,
148.96054077148438,
1074.2369384765625
],
"score": 0.9999669790267944
},
{
"category_id": 1,
"poly": [
148.8435516357422,
1791.14306640625,
818.6885375976562,
1791.14306640625,
818.6885375976562,
2030.794189453125,
148.8435516357422,
2030.794189453125
],
"score": 0.9999618530273438
},
{
"category_id": 0,
"poly": [
150.7009735107422,
684.0087890625,
623.5106201171875,
684.0087890625,
623.5106201171875,
717.03662109375,
150.7009735107422,
717.03662109375
],
"score": 0.9999415278434753
},
{
"category_id": 8,
"poly": [
146.48068237304688,
1331.6737060546875,
317.2640075683594,
1331.6737060546875,
317.2640075683594,
1400.1722412109375,
146.48068237304688,
1400.1722412109375
],
"score": 0.9998958110809326
},
{
"category_id": 1,
"poly": [
149.42420959472656,
1430.8782958984375,
818.9042358398438,
1430.8782958984375,
818.9042358398438,
1672.7386474609375,
149.42420959472656,
1672.7386474609375
],
"score": 0.9998599290847778
},
{
"category_id": 1,
"poly": [
149.18746948242188,
172.10252380371094,
818.5662231445312,
172.10252380371094,
818.5662231445312,
230.4594268798828,
149.18746948242188,
230.4594268798828
],
"score": 0.9997718334197998
},
{
"category_id": 0,
"poly": [
149.0175018310547,
1732.1090087890625,
702.1005859375,
1732.1090087890625,
702.1005859375,
1763.6046142578125,
149.0175018310547,
1763.6046142578125
],
"score": 0.9997085928916931
},
{
"category_id": 2,
"poly": [
1519.802490234375,
98.59099578857422,
1551.985107421875,
98.59099578857422,
1551.985107421875,
119.48420715332031,
1519.802490234375,
119.48420715332031
],
"score": 0.9995552897453308
},
{
"category_id": 8,
"poly": [
146.9109649658203,
1100.156494140625,
544.2803344726562,
1100.156494140625,
544.2803344726562,
1184.929443359375,
146.9109649658203,
1184.929443359375
],
"score": 0.9995207786560059
},
{
"category_id": 2,
"poly": [
148.11611938476562,
99.87767791748047,
318.926025390625,
99.87767791748047,
318.926025390625,
120.70393371582031,
148.11611938476562,
120.70393371582031
],
"score": 0.999351441860199
},
{
"category_id": 9,
"poly": [
791.7642211914062,
1130.056396484375,
818.6940307617188,
1130.056396484375,
818.6940307617188,
1161.1080322265625,
791.7642211914062,
1161.1080322265625
],
"score": 0.9908884763717651
},
{
"category_id": 9,
"poly": [
788.37060546875,
1346.8450927734375,
818.5010986328125,
1346.8450927734375,
818.5010986328125,
1377.370361328125,
788.37060546875,
1377.370361328125
],
"score": 0.9873985052108765
},
{
"category_id": 14,
"poly": [
146,
1103,
543,
1103,
543,
1184,
146,
1184
],
"score": 0.94,
"latex": "E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"
},
{
"category_id": 13,
"poly": [
1196,
354,
1278,
354,
1278,
384,
1196,
384
],
"score": 0.91,
"latex": "p(1-q)"
},
{
"category_id": 13,
"poly": [
881,
415,
1020,
415,
1020,
444,
881,
444
],
"score": 0.91,
"latex": "(1-p)(1-q)"
},
{
"category_id": 14,
"poly": [
147,
1333,
318,
1333,
318,
1400,
147,
1400
],
"score": 0.91,
"latex": "\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"
},
{
"category_id": 13,
"poly": [
1197,
657,
1263,
657,
1263,
686,
1197,
686
],
"score": 0.9,
"latex": "(1-p)"
},
{
"category_id": 13,
"poly": [
213,
1217,
263,
1217,
263,
1244,
213,
1244
],
"score": 0.88,
"latex": "E[X]"
},
{
"category_id": 13,
"poly": [
214,
1434,
245,
1434,
245,
1459,
214,
1459
],
"score": 0.87,
"latex": "\\upsigma_{H}"
},
{
"category_id": 13,
"poly": [
324,
2002,
373,
2002,
373,
2028,
324,
2028
],
"score": 0.84,
"latex": "30\\%"
},
{
"category_id": 13,
"poly": [
1209,
693,
1225,
693,
1225,
717,
1209,
717
],
"score": 0.83,
"latex": "p"
},
{
"category_id": 13,
"poly": [
990,
449,
1007,
449,
1007,
474,
990,
474
],
"score": 0.81,
"latex": "p"
},
{
"category_id": 13,
"poly": [
346,
1277,
369,
1277,
369,
1301,
346,
1301
],
"score": 0.81,
"latex": "H"
},
{
"category_id": 13,
"poly": [
1137,
661,
1154,
661,
1154,
686,
1137,
686
],
"score": 0.81,
"latex": "p"
},
{
"category_id": 13,
"poly": [
522,
1432,
579,
1432,
579,
1459,
522,
1459
],
"score": 0.81,
"latex": "H\\left(4\\right)"
},
{
"category_id": 13,
"poly": [
944,
540,
962,
540,
962,
565,
944,
565
],
"score": 0.8,
"latex": "p"
},
{
"category_id": 13,
"poly": [
1444,
936,
1461,
936,
1461,
961,
1444,
961
],
"score": 0.79,
"latex": "p"
},
{
"category_id": 13,
"poly": [
602,
1247,
624,
1247,
624,
1270,
602,
1270
],
"score": 0.78,
"latex": "H"
},
{
"category_id": 13,
"poly": [
147,
1247,
167,
1247,
167,
1271,
147,
1271
],
"score": 0.77,
"latex": "X"
},
{
"category_id": 13,
"poly": [
210,
1246,
282,
1246,
282,
1274,
210,
1274
],
"score": 0.77,
"latex": "\\operatorname{CV}(H)"
},
{
"category_id": 13,
"poly": [
1346,
268,
1361,
268,
1361,
292,
1346,
292
],
"score": 0.76,
"latex": "q"
},
{
"category_id": 13,
"poly": [
215,
957,
238,
957,
238,
981,
215,
981
],
"score": 0.74,
"latex": "H"
},
{
"category_id": 13,
"poly": [
149,
956,
173,
956,
173,
981,
149,
981
],
"score": 0.63,
"latex": "W"
},
{
"category_id": 13,
"poly": [
924,
841,
1016,
841,
1016,
868,
924,
868
],
"score": 0.56,
"latex": "8{\\cdot}00\\;\\mathrm{a.m}"
},
{
"category_id": 13,
"poly": [
956,
871,
1032,
871,
1032,
898,
956,
898
],
"score": 0.43,
"latex": "20~\\mathrm{min}"
},
{
"category_id": 13,
"poly": [
1082,
781,
1112,
781,
1112,
808,
1082,
808
],
"score": 0.41,
"latex": "(l)"
},
{
"category_id": 13,
"poly": [
697,
1821,
734,
1821,
734,
1847,
697,
1847
],
"score": 0.3,
"latex": "^{1\\mathrm{~h~}}"
}
],
"page_info": {
"page_no": 0,
"height": 2200,
"width": 1700
}
}
]
\ No newline at end of file
import tempfile
import os
import shutil
from click.testing import CliRunner
from magic_pdf.tools.cli import cli
def test_cli_pdf():
    """Run the CLI on a single PDF and verify the artifacts it writes.

    The CLI is invoked with ``-p <pdf>`` and ``-o <temporary directory>``. The
    test then checks the exit code, a minimum size for each expected output
    file, that an ``images`` directory exists, and that no
    ``content_list.json`` is produced. The temporary output directory is
    removed even when a check fails.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli,
            [
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0
        base_output_dir = os.path.join(temp_output_dir, filename, "auto")

        # Each output file must be strictly larger than its threshold (in bytes).
        min_sizes = {
            f"{filename}.md": 7000,
            "middle.json": 200000,
            "model.json": 15000,
            "origin.pdf": 500000,
            "layout.pdf": 500000,
            "spans.pdf": 500000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError if the file is missing.
            actual_size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert actual_size > min_size, f"{name}: {actual_size} <= {min_size}"

        # isdir implies the path exists.
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
        assert not os.path.exists(os.path.join(base_output_dir, "content_list.json"))
    finally:
        # teardown: always remove the temp dir so a failed check does not leak it
        shutil.rmtree(temp_output_dir)
def test_cli_path():
    """Run the CLI on a directory of PDFs and verify the artifacts per document.

    The CLI is invoked with ``-p <directory>`` and ``-o <temporary directory>``.
    For ``cli_test_01`` and ``cli_test_02`` the test checks a minimum size for
    each expected output file, that an ``images`` directory exists, and that no
    ``content_list.json`` is produced. The temporary output directory is
    removed even when a check fails.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli, ["-p", "tests/test_tools/assets/cli/path", "-o", temp_output_dir]
        )

        # check
        assert result.exit_code == 0

        # Only the markdown threshold differs between the two documents.
        md_min_sizes = {"cli_test_01": 7000, "cli_test_02": 5000}
        for filename, md_min_size in md_min_sizes.items():
            base_output_dir = os.path.join(temp_output_dir, filename, "auto")

            # Each output file must be strictly larger than its threshold (in bytes).
            min_sizes = {
                f"{filename}.md": md_min_size,
                "middle.json": 200000,
                "model.json": 15000,
                "origin.pdf": 500000,
                "layout.pdf": 500000,
                "spans.pdf": 500000,
            }
            for name, min_size in min_sizes.items():
                # os.stat raises FileNotFoundError if the file is missing.
                actual_size = os.stat(os.path.join(base_output_dir, name)).st_size
                assert actual_size > min_size, (
                    f"{filename}/{name}: {actual_size} <= {min_size}"
                )

            # isdir implies the path exists.
            assert os.path.isdir(os.path.join(base_output_dir, "images"))
            assert not os.path.exists(
                os.path.join(base_output_dir, "content_list.json")
            )
    finally:
        # teardown: always remove the temp dir so a failed check does not leak it
        shutil.rmtree(temp_output_dir)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment