Unverified Commit 158e556b authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1063 from opendatalab/release-0.10.0

Release 0.10.0
parents 038f48d3 30be5017
...@@ -52,6 +52,9 @@ if __name__ == '__main__': ...@@ -52,6 +52,9 @@ if __name__ == '__main__':
"PyYAML", # yaml "PyYAML", # yaml
"detectron2" "detectron2"
], ],
"old_linux":[
"albumentations<=1.4.20", # 1.4.21引入的simsimd不支持2019年及更早的linux系统
]
}, },
description="A practical tool for converting PDF to Markdown", # 简短描述 description="A practical tool for converting PDF to Markdown", # 简短描述
long_description=long_description, # 详细描述 long_description=long_description, # 详细描述
......
...@@ -8,6 +8,9 @@ while true; do ...@@ -8,6 +8,9 @@ while true; do
#python -m pip install -r requirements-qa.txt #python -m pip install -r requirements-qa.txt
python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
pip install modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py
python download_models.py
exit_code=$? exit_code=$?
if [ $exit_code -eq 0 ]; then if [ $exit_code -eq 0 ]; then
echo "test.sh 成功执行!" echo "test.sh 成功执行!"
......
...@@ -8,10 +8,3 @@ def clear_gpu_memory(): ...@@ -8,10 +8,3 @@ def clear_gpu_memory():
torch.cuda.empty_cache() torch.cuda.empty_cache()
print("GPU memory cleared.") print("GPU memory cleared.")
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_teardown(item, nextitem):
'''
clear GPU memory after each test
'''
yield
clear_gpu_memory()
\ No newline at end of file
...@@ -3,6 +3,15 @@ import os ...@@ -3,6 +3,15 @@ import os
import shutil import shutil
import re import re
import json import json
import torch
def clear_gpu_memory():
'''
clear GPU memory
'''
torch.cuda.empty_cache()
print("GPU memory cleared.")
def check_shell(cmd): def check_shell(cmd):
"""shell successful.""" """shell successful."""
res = os.system(cmd) res = os.system(cmd)
...@@ -10,11 +19,12 @@ def check_shell(cmd): ...@@ -10,11 +19,12 @@ def check_shell(cmd):
def update_config_file(file_path, key, value): def update_config_file(file_path, key, value):
"""update config file.""" """update config file."""
with open(file_path, 'r', encoding="utf-8") as f: with open(file_path, 'r', encoding="utf-8") as fr:
config = json.loads(f.read()) config = json.loads(fr.read())
config[key] = value config[key] = value
with open(file_path, 'w', encoding="utf-8") as f: # 保存修改后的内容
f.write(json.dumps(config)) with open(file_path, 'w', encoding='utf-8') as fw:
json.dump(config, fw, ensure_ascii=False, indent=4)
def cli_count_folders_and_check_contents(file_path): def cli_count_folders_and_check_contents(file_path):
"""" count cli files.""" """" count cli files."""
...@@ -33,6 +43,7 @@ def sdk_count_folders_and_check_contents(file_path): ...@@ -33,6 +43,7 @@ def sdk_count_folders_and_check_contents(file_path):
exit(1) exit(1)
def delete_file(path): def delete_file(path):
"""delete file.""" """delete file."""
if not os.path.exists(path): if not os.path.exists(path):
......
...@@ -13,10 +13,19 @@ model_config.__use_inside_model__ = True ...@@ -13,10 +13,19 @@ model_config.__use_inside_model__ = True
pdf_res_path = conf.conf['pdf_res_path'] pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path'] code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path'] pdf_dev_path = conf.conf['pdf_dev_path']
magic_pdf_config = "/home/quyuan/magic-pdf.json"
class TestCli: class TestCli:
"""test cli.""" """test cli."""
@pytest.fixture(autouse=True)
def setup(self):
"""
init
"""
common.clear_gpu_memory()
common.update_config_file(magic_pdf_config, "device-mode", "cuda")
# 这里可以添加任何前置操作
yield
@pytest.mark.P0 @pytest.mark.P0
def test_pdf_auto_sdk(self): def test_pdf_auto_sdk(self):
...@@ -291,22 +300,32 @@ class TestCli: ...@@ -291,22 +300,32 @@ class TestCli:
def test_local_magic_pdf_open_st_table(self): def test_local_magic_pdf_open_st_table(self):
"""magic pdf cli open st table.""" """magic pdf cli open st table."""
time.sleep(2) time.sleep(2)
pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json" #pre_cmd = "cp ~/magic_pdf_st.json ~/magic-pdf.json"
print (pre_cmd) value = {
os.system(pre_cmd) "model": "struct_eqtable",
"enable": True,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path) common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd) os.system(cli_cmd)
res = common.check_latex_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")) res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
assert res is True assert res is True
@pytest.mark.P1 @pytest.mark.P1
def test_local_magic_pdf_open_html_table(self): def test_local_magic_pdf_open_tablemaster_cuda(self):
"""magic pdf cli open html table.""" """magic pdf cli open table master html table cuda mode."""
time.sleep(2) time.sleep(2)
pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json" #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
os.system(pre_cmd) #os.system(pre_cmd)
value = {
"model": "tablemaster",
"enable": True,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path) common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
...@@ -315,24 +334,88 @@ class TestCli: ...@@ -315,24 +334,88 @@ class TestCli:
assert res is True assert res is True
@pytest.mark.P1 @pytest.mark.P1
def test_magic_pdf_close_html_table_cpu(self): def test_local_magic_pdf_open_rapidai_table(self):
"""magic pdf cli close html table cpu mode.""" """magic pdf cli open rapid ai table."""
time.sleep(2) time.sleep(2)
pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json" #pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
os.system(pre_cmd) #os.system(pre_cmd)
value = {
"model": "rapid_table",
"enable": True,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path) common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd) os.system(cli_cmd)
res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")) res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
assert res is True assert res is True
@pytest.mark.P1
def test_local_magic_pdf_doclayout_yolo(self):
"""magic pdf cli open doclyaout yolo."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_html.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "doclayout_yolo"
}
common.update_config_file(magic_pdf_config, "layout-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
@pytest.mark.P1
def test_local_magic_pdf_layoutlmv3_yolo(self):
"""magic pdf cli open layoutlmv3."""
time.sleep(2)
value = {
"model": "layoutlmv3"
}
common.update_config_file(magic_pdf_config, "layout-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
#res = common.check_html_table_exists(os.path.join(pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md"))
@pytest.mark.P1
def test_magic_pdf_cpu(self):
"""magic pdf cli cpu mode."""
time.sleep(2)
#pre_cmd = "cp ~/magic_pdf_html_table_cpu.json ~/magic-pdf.json"
#os.system(pre_cmd)
value = {
"model": "tablemaster",
"enable": False,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
common.update_config_file(magic_pdf_config, "device-mode", "cpu")
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
os.system(cli_cmd)
common.cli_count_folders_and_check_contents(os.path.join(pdf_res_path, "test_rearch_report", "auto"))
@pytest.mark.P1 @pytest.mark.P1
def test_local_magic_pdf_close_html_table(self): def test_local_magic_pdf_close_html_table(self):
"""magic pdf cli close table.""" """magic pdf cli close table."""
time.sleep(2) time.sleep(2)
pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json" #pre_cmd = "cp ~/magic_pdf_close_table.json ~/magic-pdf.json"
os.system(pre_cmd) #os.system(pre_cmd)
value = {
"model": "tablemaster",
"enable": False,
"max_time": 400
}
common.update_config_file(magic_pdf_config, "table-config", value)
pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf") pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
common.delete_file(pdf_res_path) common.delete_file(pdf_res_path)
cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path) cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
......
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/unittest/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
def test_filebased_reader_writer():
unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
sub_dir = os.path.join(unitest_dir, 'sub')
abs_fn = os.path.join(unitest_dir, 'abspath.txt')
os.makedirs(sub_dir, exist_ok=True)
writer = FileBasedDataWriter(sub_dir)
reader = FileBasedDataReader(sub_dir)
writer.write('test.txt', b'hello world')
assert reader.read('test.txt') == b'hello world'
writer.write(abs_fn, b'hello world')
assert reader.read(abs_fn) == b'hello world'
shutil.rmtree(unitest_dir)
import json
import os
import fitz
import pytest
from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
"""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
bucket_2 = os.getenv('S3_BUCKET_2', '')
ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
sk_2 = os.getenv('S3_SECRET_KEY_2', '')
endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
reader = MultiBucketS3DataReader(bucket, s3configs)
writer = MultiBucketS3DataWriter(bucket, s3configs)
bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
assert bits == reader.read(
f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits = reader.read(
f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
)
docs = fitz.open('pdf', bits)
assert len(docs) == 10
bits = reader.read(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert bits == reader.read_at(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
)
assert len(json.loads(bits)) > 0
writer.write_string(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
)
assert 'abc'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer.write(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
'123'.encode(),
)
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY_2', None) is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer_with_prefix():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx.
export S3_BUCKET_2=xxx export S3_ACCESS_KEY_2=xxx export S3_SECRET_KEY_2=xxx export S3_ENDPOINT_2=xxx
"""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
bucket_2 = os.getenv('S3_BUCKET_2', '')
ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
sk_2 = os.getenv('S3_SECRET_KEY_2', '')
endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
prefix = 'meta-index'
reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)
bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
assert bits == reader.read(
f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits = reader.read(
f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
)
docs = fitz.open('pdf', bits)
assert len(docs) == 10
bits = reader.read(
'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert bits == reader.read_at(
'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
)
assert len(json.loads(bits)) > 0
writer.write_string(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
)
assert 'abc'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
assert 'abc'.encode() == reader.read(
f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer.write(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
'123'.encode(),
)
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
import json
import os
import pytest
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
)
def test_s3_reader_writer():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
reader = S3DataReader('', bucket, ak, sk, endpoint_url)
writer = S3DataWriter('', bucket, ak, sk, endpoint_url)
bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
assert bits == reader.read(
f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits = reader.read(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert bits == reader.read_at(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
)
assert len(json.loads(bits)) > 0
writer.write_string(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
)
assert 'abc'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer.write(
f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
'123'.encode(),
)
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
)
def test_s3_reader_writer_with_prefix():
"""test multi bucket s3 reader writer must config s3 config in the
environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export
S3_SECRET_KEY=xxx export S3_ENDPOINT=xxx."""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
prefix = 'meta-index'
reader = S3DataReader(prefix, bucket, ak, sk, endpoint_url)
writer = S3DataWriter(prefix, bucket, ak, sk, endpoint_url)
bits = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
assert bits == reader.read(
f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
bits = reader.read(
'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
)
assert bits == reader.read_at(
'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
)
assert len(json.loads(bits)) > 0
writer.write_string(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
)
assert 'abc'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
assert 'abc'.encode() == reader.read(
f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
)
writer.write(
f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
'123'.encode(),
)
assert '123'.encode() == reader.read(
'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
)
import json
import os
import pytest
from magic_pdf.data.io.s3 import S3Reader, S3Writer
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY', None) is None, reason='s3 config not found'
)
def test_s3_reader():
"""test s3 reader.
must config s3 config in the environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export S3_SECRET_KEY=xxx
export S3_ENDPOINT=xxx
"""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
reader = S3Reader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
bits = reader.read(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
)
assert len(bits) > 0
bits = reader.read_at(
'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
566,
713,
)
assert len(json.loads(bits)) > 0
@pytest.mark.skipif(
os.getenv('S3_ACCESS_KEY', None) is None, reason='s3 config not found'
)
def test_s3_writer():
"""test s3 reader.
must config s3 config in the environment export S3_BUCKET=xxx export S3_ACCESS_KEY=xxx export S3_SECRET_KEY=xxx
export S3_ENDPOINT=xxx
"""
bucket = os.getenv('S3_BUCKET', '')
ak = os.getenv('S3_ACCESS_KEY', '')
sk = os.getenv('S3_SECRET_KEY', '')
endpoint_url = os.getenv('S3_ENDPOINT', '')
writer = S3Writer(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
test_fn = 'unittest/io/test.jsonl'
writer.write(test_fn, '123'.encode())
reader = S3Reader(bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint_url)
bits = reader.read(test_fn)
assert bits.decode() == '123'
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
with open('tests/unittest/test_data/assets/pdfs/test_01.pdf', 'rb') as f:
bits = f.read()
datasets = PymuDocDataset(bits)
assert len(datasets) > 0
assert datasets.get_page(0).get_page_info().h > 100
def test_imagedataset():
with open('tests/unittest/test_data/assets/pngs/test_01.png', 'rb') as f:
bits = f.read()
datasets = ImageDataset(bits)
assert len(datasets) == 1
assert datasets.get_page(0).get_page_info().w > 100
import pytest
import json
from magic_pdf.libs.json_compressor import JsonCompressor
# Test data fixtures
@pytest.fixture
def test_cases():
return [
# Simple dictionary
{"name": "John", "age": 30},
# Nested dictionary
{
"person": {
"name": "Alice",
"address": {
"street": "123 Main St",
"city": "New York"
}
}
},
# List of dictionaries
[
{"id": 1, "value": "first"},
{"id": 2, "value": "second"}
],
# Dictionary with various data types
{
"string": "hello",
"integer": 42,
"float": 3.14,
"boolean": True,
"null": None,
"array": [1, 2, 3],
"nested": {"key": "value"}
},
# Empty structures
{},
[],
{"empty_list": [], "empty_dict": {}}
]
@pytest.fixture
def large_data():
return {
"data": ["test" * 100] * 100 # Create a large repeated string
}
def test_compression_decompression_cycle(test_cases):
"""Test that data remains intact after compression and decompression"""
for test_data in test_cases:
# Compress the data
compressed = JsonCompressor.compress_json(test_data)
# Verify compressed string is not empty and is a string
assert isinstance(compressed, str)
assert len(compressed) > 0
# Decompress the data
decompressed = JsonCompressor.decompress_json(compressed)
# Verify the decompressed data matches original
assert test_data == decompressed
def test_compression_reduces_size(large_data):
"""Test that compression actually reduces data size for large enough input"""
original_size = len(json.dumps(large_data))
compressed = JsonCompressor.compress_json(large_data)
compressed_size = len(compressed)
# Verify compression actually saved space
assert compressed_size < original_size
def test_invalid_json_serializable():
"""Test handling of non-JSON serializable input"""
with pytest.raises(TypeError):
JsonCompressor.compress_json(set([1, 2, 3])) # sets are not JSON serializable
def test_invalid_compressed_string():
"""Test handling of invalid compressed string"""
with pytest.raises(Exception):
JsonCompressor.decompress_json("invalid_base64_string")
def test_empty_string_input():
"""Test handling of empty string input"""
with pytest.raises(Exception):
JsonCompressor.decompress_json("")
def test_special_characters():
"""Test handling of special characters"""
test_data = {
"special": "!@#$%^&*()_+-=[]{}|;:,.<>?",
"unicode": "Hello 世界 🌍"
}
compressed = JsonCompressor.compress_json(test_data)
decompressed = JsonCompressor.decompress_json(compressed)
assert test_data == decompressed
# Parametrized test for different types of input
@pytest.mark.parametrize("test_input", [
{"simple": "value"},
[1, 2, 3],
{"nested": {"key": "value"}},
["mixed", 1, True, None],
{"unicode": "🌍"}
])
def test_various_input_types(test_input):
"""Test compression and decompression with various input types"""
compressed = JsonCompressor.compress_json(test_input)
decompressed = JsonCompressor.decompress_json(compressed)
assert test_input == decompressed
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment