Unverified Commit 85a4750d authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #3026 from Sidney233/dev

Dev
parents 206ed770 a7e75dc0
This diff is collapsed.
"""
bench
"""
import os
import shutil
import json
from lib import calculate_score
import pytest
from conf import conf
# Value of the GITHUB_WORKSPACE environment variable (None when it is unset).
code_path = os.environ.get('GITHUB_WORKSPACE')
# Directories read from the shared test configuration object.
pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf["pdf_res_path"]
class TestBench():
    """
    CI benchmark regression test.
    """

    def test_ci_ben(self):
        """
        Compare freshly computed scores with the last recorded baseline.

        The baseline is the last JSON line of ``<pdf_dev_path>/result.json``.
        After the pre-clean script has run, new scores are obtained from
        ``get_score()``, written to ``<pdf_dev_path>/ci/result.json`` and
        asserted to be at least as large as the baseline for each metric.
        """
        # Read the baseline and release the handle before anything else
        # touches the dev directory.
        with open(os.path.join(pdf_dev_path, "result.json"), "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        last_line = lines[-1].strip()
        last_score = json.loads(last_line)
        last_simscore = last_score["average_sim_score"]
        last_editdistance = last_score["average_edit_distance"]
        last_bleu = last_score["average_bleu_score"]

        os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
        now_score = get_score()
        print("now_score:", now_score)

        # exist_ok makes a separate existence check unnecessary.
        ci_dir = os.path.join(pdf_dev_path, "ci")
        os.makedirs(ci_dir, exist_ok=True)
        # Close the report before asserting so it is flushed even when the
        # comparison below fails.
        with open(os.path.join(ci_dir, "result.json"), "w+", encoding="utf-8") as fw:
            fw.write(json.dumps(now_score) + "\n")

        now_simscore = now_score["average_sim_score"]
        now_editdistance = now_score["average_edit_distance"]
        now_bleu = now_score["average_bleu_score"]
        assert last_simscore <= now_simscore
        # NOTE(review): this requires the edit distance not to decrease; if a
        # lower distance means a better result the comparison is inverted.
        # Confirm against calculate_score before changing it.
        assert last_editdistance <= now_editdistance
        assert last_bleu <= now_bleu
def get_score():
    """
    Compute the score summary for the ``mineru`` output.

    A ``calculate_score.Scoring`` object is created for
    ``<pdf_dev_path>/result.json``, asked to calculate the total similarity
    for ``"mineru"`` under ``pdf_dev_path``, and its ``summary_scores()``
    result is returned unchanged.
    """
    result_file = os.path.join(pdf_dev_path, "result.json")
    scorer = calculate_score.Scoring(result_file)
    scorer.calculate_similarity_total("mineru", pdf_dev_path)
    return scorer.summary_scores()
import json
import os
import shutil
from conf import conf
from lib import calculate_score
# Directories read from the shared test configuration object.
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
class TestCliCuda:
    """test cli cuda."""

    def test_pdf_sdk_cuda(self):
        """Convert the sample PDFs, then compare the new scores with the baseline.

        The result directory is cleaned and every sample PDF is converted via
        ``pdf_to_markdown()``. The baseline is the last JSON line of
        ``<pdf_dev_path>/result.json``; the new scores come from
        ``get_score()``, are written to ``<pdf_dev_path>/ci/result.json`` and
        must be at least as large as the baseline for each metric.
        """
        clean_magicpdf(pdf_res_path)
        pdf_to_markdown()

        # Read the baseline and release the handle immediately.
        with open(os.path.join(pdf_dev_path, 'result.json'), 'r', encoding='utf-8') as fr:
            lines = fr.readlines()
        last_line = lines[-1].strip()
        last_score = json.loads(last_line)
        last_simscore = last_score['average_sim_score']
        last_editdistance = last_score['average_edit_distance']
        last_bleu = last_score['average_bleu_score']

        os.system(f'python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}')
        now_score = get_score()
        print('now_score:', now_score)

        # exist_ok makes a separate existence check unnecessary.
        ci_dir = os.path.join(pdf_dev_path, 'ci')
        os.makedirs(ci_dir, exist_ok=True)
        # Close the report before asserting so it is flushed even when the
        # comparison below fails.
        with open(os.path.join(ci_dir, 'result.json'), 'w+', encoding='utf-8') as fw:
            fw.write(json.dumps(now_score) + '\n')

        now_simscore = now_score['average_sim_score']
        now_editdistance = now_score['average_edit_distance']
        now_bleu = now_score['average_bleu_score']
        assert last_simscore <= now_simscore
        # NOTE(review): this requires the edit distance not to decrease; if a
        # lower distance means a better result the comparison is inverted.
        # Confirm against calculate_score before changing it.
        assert last_editdistance <= now_editdistance
        assert last_bleu <= now_bleu
def pdf_to_markdown():
    """Convert every sample PDF with the magic-pdf CLI and collect the markdown.

    For each ``*.pdf`` file in ``<pdf_dev_path>/pdf`` the command
    ``magic-pdf pdf-command`` is run, after which
    ``<pdf_res_path>/<name>/auto/<name>.md`` is copied to
    ``<pdf_dev_path>/mineru/<name>.md``.
    """
    pdf_dir = os.path.join(pdf_dev_path, 'pdf')
    # splitext removes only the final extension, so "a.b.pdf" keeps the stem
    # "a.b"; split('.')[0] would have produced "a" and a non-existent path.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_dir)
        if pdf_file.endswith('.pdf')
    ]
    dir_path = os.path.join(pdf_dev_path, 'mineru')
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
        cmd = 'magic-pdf pdf-command --pdf %s --inside_model true' % (pdf_path)
        os.system(cmd)
        # exist_ok makes a separate existence check unnecessary.
        os.makedirs(dir_path, exist_ok=True)
        res_path = os.path.join(dir_path, f'{demo_name}.md')
        src_path = os.path.join(pdf_res_path, demo_name, 'auto', f'{demo_name}.md')
        shutil.copy(src_path, res_path)
def get_score():
    """Compute the score summary for the ``mineru`` output.

    A ``calculate_score.Scoring`` object is created for
    ``<pdf_dev_path>/result.json``, asked to calculate the total similarity
    for ``'mineru'`` under ``pdf_dev_path``, and its ``summary_scores()``
    result is returned unchanged.
    """
    result_file = os.path.join(pdf_dev_path, 'result.json')
    scorer = calculate_score.Scoring(result_file)
    scorer.calculate_similarity_total('mineru', pdf_dev_path)
    return scorer.summary_scores()
def clean_magicpdf(pdf_res_path):
    """Remove the magic-pdf result path, if it exists.

    The path is deleted with ``shutil``/``os`` calls instead of a shell
    ``rm -rf`` command, so paths containing spaces or shell metacharacters
    are handled correctly and no external program is required.

    Args:
        pdf_res_path: directory (or file / symlink) to delete.

    Like ``rm -rf``, this is best effort: a missing path or a failed removal
    does not raise.
    """
    try:
        if os.path.isdir(pdf_res_path) and not os.path.islink(pdf_res_path):
            shutil.rmtree(pdf_res_path, ignore_errors=True)
        else:
            # Regular files and symlinks (including links to directories) are
            # unlinked rather than followed.
            os.remove(pdf_res_path)
    except OSError:
        # Missing path or insufficient permissions: keep the silent,
        # non-fatal behaviour of the previous shell command.
        pass
"""test cli and sdk."""
import logging
import os
import pytest
from conf import conf
from lib import common
import time
import magic_pdf.model as model_config
from magic_pdf.data.read_api import read_local_images
from magic_pdf.data.read_api import read_local_office
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# Directories read from the shared test configuration object.
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
# Config file that the tests rewrite through common.update_config_file.
# NOTE(review): absolute path inside one user's home directory - confirm it
# exists on every machine that runs these tests.
magic_pdf_config = "/home/quyuan/magic-pdf.json"
class TestCli:
    """End-to-end tests for the magic-pdf SDK pipeline and command-line tools."""

    @pytest.fixture(autouse=True)
    def setup(self):
        """Run before every test: GPU cleanup and ``device-mode`` set to ``cuda``."""
        common.clear_gpu_memory()
        common.update_config_file(magic_pdf_config, "device-mode", "cuda")
        # Any additional per-test preparation can be added here.
        yield

    # ------------------------------------------------------------------
    # private helpers (not collected by pytest: names do not start with test)
    # ------------------------------------------------------------------
    @staticmethod
    def _list_stems(directory, suffix):
        """Return the names of files in *directory* ending in *suffix*, suffix removed.

        Only the trailing suffix is stripped, so a name such as ``a.b.pdf``
        yields ``a.b``; ``split('.')[0]`` would have yielded ``a`` and the
        rebuilt path would not exist.
        """
        return [
            file_name[:-len(suffix)]
            for file_name in os.listdir(directory)
            if file_name.endswith(suffix)
        ]

    @staticmethod
    def _rapid_table_config(enable):
        """Return the ``table-config`` value for the rapid_table model."""
        return {
            "model": "rapid_table",
            "enable": enable,
            "sub_model": "slanet_plus",
            "max_time": 400
        }

    @staticmethod
    def _parse_research_report():
        """Run the magic-pdf CLI on the sample report; return its ``auto`` output dir."""
        pdf_path = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
        common.delete_file(pdf_res_path)
        cli_cmd = "magic-pdf -p %s -o %s" % (pdf_path, pdf_res_path)
        os.system(cli_cmd)
        return os.path.join(pdf_res_path, "test_rearch_report", "auto")

    def _run_cli_mode(self, mode):
        """Run ``magic-pdf -m <mode>`` on every sample PDF and check each output dir."""
        time.sleep(2)
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        res_path = os.path.join(pdf_dev_path, 'mineru')
        for demo_name in self._list_stems(pdf_dir, '.pdf'):
            common.delete_file(res_path)
            cmd = 'magic-pdf -p %s -o %s -m %s' % (os.path.join(
                pdf_dir, f'{demo_name}.pdf'), res_path, mode)
            logging.info(cmd)
            os.system(cmd)
            common.cli_count_folders_and_check_contents(
                os.path.join(res_path, demo_name, mode))

    def _convert_office_files(self, sub_dir, suffix):
        """Convert every *suffix* file in ``<pdf_dev_path>/<sub_dir>`` to markdown."""
        src_dir = os.path.join(pdf_dev_path, sub_dir)
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        for name_without_suff in self._list_stems(src_dir, suffix):
            office_path = os.path.join(src_dir, f'{name_without_suff}{suffix}')
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
            ds = read_local_office(office_path)[0]
            common.delete_file(dir_path)
            ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(
                md_writer, f"{name_without_suff}.md", image_dir)
            common.sdk_count_folders_and_check_contents(dir_path)

    @staticmethod
    def _run_dev_jsonl(method):
        """Run ``magic-pdf-dev`` on the sample jsonl file with the given method."""
        time.sleep(2)
        jsonl_path = os.path.join(pdf_dev_path, 'line1.jsonl')
        cmd = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_path, method)
        logging.info(cmd)
        os.system(cmd)

    @staticmethod
    def _run_dev_pdf_json(method):
        """Run ``magic-pdf-dev`` on the sample report plus model json with the given method."""
        time.sleep(2)
        json_path = os.path.join(pdf_dev_path, 'test_model.json')
        pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
        cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (pdf_path, json_path, method)
        logging.info(cmd)
        os.system(cmd)

    # ------------------------------------------------------------------
    # SDK tests
    # ------------------------------------------------------------------
    @pytest.mark.P0
    def test_pdf_local_sdk(self):
        """Run the SDK pipeline on every sample PDF and dump all of its artefacts."""
        pdf_dir = os.path.join(pdf_dev_path, 'pdf')
        local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        for name_without_suff in self._list_stems(pdf_dir, '.pdf'):
            pdf_path = os.path.join(pdf_dir, f'{name_without_suff}.pdf')
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
            reader1 = FileBasedDataReader("")
            pdf_bytes = reader1.read(pdf_path)
            ds = PymuDocDataset(pdf_bytes)
            # inference + pipeline, in the mode chosen by the classifier
            if ds.classify() == SupportedPdfParseMethod.OCR:
                infer_result = ds.apply(doc_analyze, ocr=True)
                pipe_result = infer_result.pipe_ocr_mode(image_writer)
            else:
                infer_result = ds.apply(doc_analyze, ocr=False)
                pipe_result = infer_result.pipe_txt_mode(image_writer)
            common.delete_file(dir_path)
            # draw model result on each page
            infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
            # The getters below are called only to exercise the API; their
            # return values are not inspected.
            infer_result.get_infer_res()
            # draw layout result on each page
            pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
            # draw spans result on each page
            pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
            # markdown
            pipe_result.get_markdown(image_dir)
            pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
            # content list
            pipe_result.get_content_list(image_dir)
            pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
            # middle json
            pipe_result.get_middle_json()
            pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
            common.sdk_count_folders_and_check_contents(dir_path)

    @pytest.mark.P0
    def test_pdf_s3_sdk(self):
        """pdf s3 sdk test."""
        # TODO(review): placeholder - no S3 read or write is exercised yet.
        # Listing the sample directory still fails loudly when it is missing.
        self._list_stems(os.path.join(pdf_dev_path, 'pdf'), '.pptx'[:0] + '.pdf')

    @pytest.mark.P0
    def test_pdf_local_ppt(self):
        """Convert every sample .pptx file to markdown through the SDK."""
        self._convert_office_files('ppt', '.pptx')

    @pytest.mark.P0
    def test_pdf_local_image(self):
        """Convert every sample .jpg file to markdown through the SDK (OCR mode)."""
        image_src_dir = os.path.join(pdf_dev_path, 'images')
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        for name_without_suff in self._list_stems(image_src_dir, '.jpg'):
            image_path = os.path.join(image_src_dir, f'{name_without_suff}.jpg')
            common.delete_file(dir_path)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
            ds = read_local_images(image_path)[0]
            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
                md_writer, f"{name_without_suff}.md", image_dir)
            common.sdk_count_folders_and_check_contents(dir_path)

    @pytest.mark.P0
    def test_local_image_dir(self):
        """Convert a whole directory of .png/.jpg images to numbered markdown files."""
        image_src_dir = os.path.join(pdf_dev_path, 'images')
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
        common.delete_file(dir_path)
        dss = read_local_images(image_src_dir, suffixes=['.png', '.jpg'])
        for count, ds in enumerate(dss):
            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{count}.md", image_dir)
        common.sdk_count_folders_and_check_contents(dir_path)

    def test_local_doc_parse(self):
        """Convert every sample .docx file to markdown through the SDK."""
        self._convert_office_files('doc', '.docx')

    # ------------------------------------------------------------------
    # magic-pdf CLI tests
    # ------------------------------------------------------------------
    @pytest.mark.P0
    def test_pdf_cli_auto(self):
        """magic_pdf cli test auto."""
        self._run_cli_mode('auto')

    @pytest.mark.P0
    def test_pdf_cli_txt(self):
        """magic_pdf cli test txt."""
        self._run_cli_mode('txt')

    @pytest.mark.P0
    def test_pdf_cli_ocr(self):
        """magic_pdf cli test ocr."""
        self._run_cli_mode('ocr')

    # ------------------------------------------------------------------
    # magic-pdf-dev CLI tests
    # ------------------------------------------------------------------
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_txt(self):
        """magic_pdf_dev cli local txt."""
        self._run_dev_jsonl("txt")

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_ocr(self):
        """magic_pdf_dev cli local ocr."""
        self._run_dev_jsonl('ocr')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_local_jsonl_auto(self):
        """magic_pdf_dev cli local auto."""
        self._run_dev_jsonl('auto')

    # NOTE(review): the three "s3" variants run exactly the same command on
    # the same local jsonl file as the "local" variants above.
    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_txt(self):
        """magic_pdf_dev cli s3 txt."""
        self._run_dev_jsonl("txt")

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_ocr(self):
        """magic_pdf_dev cli s3 ocr."""
        self._run_dev_jsonl('ocr')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_s3_jsonl_auto(self):
        """magic_pdf_dev cli s3 auto."""
        self._run_dev_jsonl('auto')

    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_auto(self):
        """magic_pdf_dev cli pdf+json auto."""
        self._run_dev_pdf_json('auto')

    @pytest.mark.skip(reason='out-of-date api')
    @pytest.mark.P1
    def test_pdf_dev_cli_pdf_json_ocr(self):
        """magic_pdf_dev cli pdf+json ocr."""
        # Previously passed 'auto', which duplicated the test above instead of
        # exercising the ocr method named by this test.
        self._run_dev_pdf_json('ocr')

    # ------------------------------------------------------------------
    # configuration-driven CLI tests
    # ------------------------------------------------------------------
    @pytest.mark.P1
    def test_local_magic_pdf_open_rapidai_table(self):
        """magic pdf cli with the rapid_table model enabled."""
        time.sleep(2)
        common.update_config_file(magic_pdf_config, "table-config", self._rapid_table_config(True))
        auto_dir = self._parse_research_report()
        res = common.check_html_table_exists(os.path.join(auto_dir, "test_rearch_report.md"))
        assert res is True

    @pytest.mark.P1
    def test_local_magic_pdf_doclayout_yolo(self):
        """magic pdf cli with the doclayout_yolo layout model."""
        time.sleep(2)
        value = {
            "model": "doclayout_yolo"
        }
        common.update_config_file(magic_pdf_config, "layout-config", value)
        common.cli_count_folders_and_check_contents(self._parse_research_report())

    # Skip reason in English: "layoutlmv3 is deprecated".
    @pytest.mark.skip(reason="layoutlmv3废弃")
    @pytest.mark.P1
    def test_local_magic_pdf_layoutlmv3_yolo(self):
        """magic pdf cli with the layoutlmv3 layout model."""
        time.sleep(2)
        value = {
            "model": "layoutlmv3"
        }
        common.update_config_file(magic_pdf_config, "layout-config", value)
        common.cli_count_folders_and_check_contents(self._parse_research_report())

    @pytest.mark.P1
    def test_magic_pdf_cpu(self):
        """magic pdf cli cpu mode."""
        time.sleep(2)
        common.update_config_file(magic_pdf_config, "table-config", self._rapid_table_config(True))
        # Overrides the "cuda" value written by the autouse setup fixture.
        common.update_config_file(magic_pdf_config, "device-mode", "cpu")
        common.cli_count_folders_and_check_contents(self._parse_research_report())

    @pytest.mark.P1
    def test_local_magic_pdf_close_html_table(self):
        """magic pdf cli with table recognition disabled."""
        time.sleep(2)
        common.update_config_file(magic_pdf_config, "table-config", self._rapid_table_config(False))
        auto_dir = self._parse_research_report()
        res = common.check_close_tables(os.path.join(auto_dir, "test_rearch_report.md"))
        assert res is True
# Allow the module to be executed directly; pytest.main() is called without arguments.
if __name__ == '__main__':
pytest.main()
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/unittest/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
def test_filebased_reader_writer():
    """Round-trip bytes through FileBasedDataWriter / FileBasedDataReader.

    Both a relative file name and an absolute path are written and read back
    with a writer and reader constructed for the same sub-directory. The
    scratch directory is removed even when an assertion fails.
    """
    unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
    sub_dir = os.path.join(unitest_dir, 'sub')
    abs_fn = os.path.join(unitest_dir, 'abspath.txt')
    os.makedirs(sub_dir, exist_ok=True)
    try:
        writer = FileBasedDataWriter(sub_dir)
        reader = FileBasedDataReader(sub_dir)

        # relative name
        writer.write('test.txt', b'hello world')
        assert reader.read('test.txt') == b'hello world'

        # absolute path located outside sub_dir
        writer.write(abs_fn, b'hello world')
        assert reader.read(abs_fn) == b'hello world'
    finally:
        # Without the finally block a failed assertion left the scratch
        # directory behind for later runs.
        shutil.rmtree(unitest_dir)
import json
import os
import fitz
import pytest
from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer():
    """Exercise the multi-bucket S3 reader and writer without a key prefix.

    Both S3 configurations must be present in the environment::

        export S3_BUCKET=xxx S3_ACCESS_KEY=xxx S3_SECRET_KEY=xxx S3_ENDPOINT=xxx
        export S3_BUCKET_2=xxx S3_ACCESS_KEY_2=xxx S3_SECRET_KEY_2=xxx S3_ENDPOINT_2=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')
    s3configs = [
        S3Config(
            bucket_name=bucket,
            access_key=os.getenv('S3_ACCESS_KEY', ''),
            secret_key=os.getenv('S3_SECRET_KEY', ''),
            endpoint_url=os.getenv('S3_ENDPOINT', ''),
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=os.getenv('S3_ACCESS_KEY_2', ''),
            secret_key=os.getenv('S3_SECRET_KEY_2', ''),
            endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
        ),
    ]
    reader = MultiBucketS3DataReader(bucket, s3configs)
    writer = MultiBucketS3DataWriter(bucket, s3configs)

    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A bare key and the full s3:// URI of the first bucket must return the same bytes.
    whole_file = reader.read(jsonl_key)
    assert whole_file == reader.read(f's3://{bucket}/{jsonl_key}')

    # An object in the second bucket is addressed by its full URI.
    pdf_bits = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    docs = fitz.open('pdf', pdf_bits)
    assert len(docs) == 10

    # The "?bytes=" suffix must match an explicit read_at call.
    ranged = reader.read(f'{jsonl_key}?bytes=566,713')
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    out_dir = 'unittest/data/data_reader_writer/multi_bucket_s3_data'
    first_key = f'{out_dir}/test01.txt'
    second_key = f'{out_dir}/test02.txt'

    writer.write_string(first_key, 'abc')
    assert b'abc' == reader.read(first_key)

    writer.write(second_key, b'123')
    assert b'123' == reader.read(second_key)
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer_with_prefix():
    """Exercise the multi-bucket S3 reader and writer with a ``bucket/prefix`` default.

    Both S3 configurations must be present in the environment::

        export S3_BUCKET=xxx S3_ACCESS_KEY=xxx S3_SECRET_KEY=xxx S3_ENDPOINT=xxx
        export S3_BUCKET_2=xxx S3_ACCESS_KEY_2=xxx S3_SECRET_KEY_2=xxx S3_ENDPOINT_2=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')
    s3configs = [
        S3Config(
            bucket_name=bucket,
            access_key=os.getenv('S3_ACCESS_KEY', ''),
            secret_key=os.getenv('S3_SECRET_KEY', ''),
            endpoint_url=os.getenv('S3_ENDPOINT', ''),
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=os.getenv('S3_ACCESS_KEY_2', ''),
            secret_key=os.getenv('S3_SECRET_KEY_2', ''),
            endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
        ),
    ]
    prefix = 'meta-index'
    default_location = f'{bucket}/{prefix}'
    reader = MultiBucketS3DataReader(default_location, s3configs)
    writer = MultiBucketS3DataWriter(default_location, s3configs)

    jsonl_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A key relative to the prefix and the full s3:// URI must return the same bytes.
    whole_file = reader.read(jsonl_key)
    assert whole_file == reader.read(f's3://{bucket}/{prefix}/{jsonl_key}')

    # An object in the second bucket is addressed by its full URI.
    pdf_bits = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    docs = fitz.open('pdf', pdf_bits)
    assert len(docs) == 10

    # The "?bytes=" suffix must match an explicit read_at call.
    ranged = reader.read(f'{jsonl_key}?bytes=566,713')
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    out_dir = 'unittest/data/data_reader_writer/multi_bucket_s3_data'
    first_key = f'{out_dir}/test01.txt'
    second_key = f'{out_dir}/test02.txt'

    writer.write_string(first_key, 'abc')
    assert b'abc' == reader.read(first_key)
    # The written object must also be reachable through its full URI.
    assert b'abc' == reader.read(f's3://{bucket}/{prefix}/{first_key}')

    writer.write(second_key, b'123')
    assert b'123' == reader.read(second_key)
import json
import os
import pytest
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
)
def test_s3_reader_writer():
    """Exercise S3DataReader / S3DataWriter constructed with an empty prefix.

    The S3 configuration must be present in the environment::

        export S3_BUCKET=xxx S3_ACCESS_KEY=xxx S3_SECRET_KEY=xxx S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    credentials = (
        os.getenv('S3_ACCESS_KEY', ''),
        os.getenv('S3_SECRET_KEY', ''),
        os.getenv('S3_ENDPOINT', ''),
    )
    reader = S3DataReader('', bucket, *credentials)
    writer = S3DataWriter('', bucket, *credentials)

    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A bare key and the full s3:// URI must return the same bytes.
    whole_file = reader.read(jsonl_key)
    assert whole_file == reader.read(f's3://{bucket}/{jsonl_key}')

    # The "?bytes=" suffix must match an explicit read_at call.
    ranged = reader.read(f'{jsonl_key}?bytes=566,713')
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    out_dir = 'unittest/data/data_reader_writer/multi_bucket_s3_data'
    first_key = f'{out_dir}/test01.txt'
    second_key = f'{out_dir}/test02.txt'

    writer.write_string(first_key, 'abc')
    assert b'abc' == reader.read(first_key)

    # The second object is written through a bucket-qualified path and read
    # back through the bare key.
    writer.write(f'{bucket}/{second_key}', b'123')
    assert b'123' == reader.read(second_key)
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
)
def test_s3_reader_writer_with_prefix():
    """Exercise S3DataReader / S3DataWriter constructed with the ``meta-index`` prefix.

    The S3 configuration must be present in the environment::

        export S3_BUCKET=xxx S3_ACCESS_KEY=xxx S3_SECRET_KEY=xxx S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    credentials = (
        os.getenv('S3_ACCESS_KEY', ''),
        os.getenv('S3_SECRET_KEY', ''),
        os.getenv('S3_ENDPOINT', ''),
    )
    prefix = 'meta-index'
    reader = S3DataReader(prefix, bucket, *credentials)
    writer = S3DataWriter(prefix, bucket, *credentials)

    jsonl_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # A key relative to the prefix and the full s3:// URI must return the same bytes.
    whole_file = reader.read(jsonl_key)
    assert whole_file == reader.read(f's3://{bucket}/{prefix}/{jsonl_key}')

    # The "?bytes=" suffix must match an explicit read_at call.
    ranged = reader.read(f'{jsonl_key}?bytes=566,713')
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    out_dir = 'unittest/data/data_reader_writer/multi_bucket_s3_data'
    first_key = f'{out_dir}/test01.txt'
    second_key = f'{out_dir}/test02.txt'

    writer.write_string(first_key, 'abc')
    assert b'abc' == reader.read(first_key)
    # The written object must also be reachable through its full URI.
    assert b'abc' == reader.read(f's3://{bucket}/{prefix}/{first_key}')

    # The second object is written through a bucket- and prefix-qualified
    # path and read back through the relative key.
    writer.write(f'{bucket}/{prefix}/{second_key}', b'123')
    assert b'123' == reader.read(second_key)
import json
import os
import pytest
from magic_pdf.data.io.s3 import S3Reader, S3Writer
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
)
def test_s3_reader():
    """Read a whole object and a byte range through S3Reader.

    The S3 configuration must be present in the environment::

        export S3_BUCKET=xxx S3_ACCESS_KEY=xxx S3_SECRET_KEY=xxx S3_ENDPOINT=xxx
    """
    reader = S3Reader(
        bucket=os.getenv('S3_BUCKET', ''),
        ak=os.getenv('S3_ACCESS_KEY', ''),
        sk=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    whole_file = reader.read(jsonl_key)
    assert len(whole_file) > 0

    # The requested range must contain a non-empty JSON document.
    ranged = reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
)
def test_s3_writer():
    """Write an object with S3Writer and read it back with S3Reader.

    The S3 configuration must be present in the environment::

        export S3_BUCKET=xxx S3_ACCESS_KEY=xxx S3_SECRET_KEY=xxx S3_ENDPOINT=xxx
    """
    connection = {
        'bucket': os.getenv('S3_BUCKET', ''),
        'ak': os.getenv('S3_ACCESS_KEY', ''),
        'sk': os.getenv('S3_SECRET_KEY', ''),
        'endpoint_url': os.getenv('S3_ENDPOINT', ''),
    }
    test_fn = 'unittest/io/test.jsonl'

    writer = S3Writer(**connection)
    writer.write(test_fn, b'123')

    reader = S3Reader(**connection)
    stored = reader.read(test_fn)
    assert stored.decode() == '123'
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
    """A PDF loaded into PymuDocDataset has pages, and page 0 is taller than 100."""
    pdf_file_path = 'tests/unittest/test_data/assets/pdfs/test_01.pdf'
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()
    dataset = PymuDocDataset(pdf_bytes)
    assert len(dataset) > 0
    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.h > 100
def test_imagedataset():
    """A PNG loaded into ImageDataset yields exactly one page wider than 100."""
    png_file_path = 'tests/unittest/test_data/assets/pngs/test_01.png'
    with open(png_file_path, 'rb') as png_file:
        png_bytes = png_file.read()
    dataset = ImageDataset(png_bytes)
    assert len(dataset) == 1
    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.w > 100
import pytest
import json
from magic_pdf.libs.json_compressor import JsonCompressor
# Test data fixtures
@pytest.fixture
def test_cases():
    """Provide JSON-serialisable samples of increasing structural variety."""
    flat_dict = {"name": "John", "age": 30}
    nested_dict = {
        "person": {
            "name": "Alice",
            "address": {
                "street": "123 Main St",
                "city": "New York",
            },
        },
    }
    list_of_dicts = [
        {"id": 1, "value": "first"},
        {"id": 2, "value": "second"},
    ]
    mixed_value_types = {
        "string": "hello",
        "integer": 42,
        "float": 3.14,
        "boolean": True,
        "null": None,
        "array": [1, 2, 3],
        "nested": {"key": "value"},
    }
    # The last three entries are the empty structures.
    return [
        flat_dict,
        nested_dict,
        list_of_dicts,
        mixed_value_types,
        {},
        [],
        {"empty_list": [], "empty_dict": {}},
    ]
@pytest.fixture
def large_data():
    """Provide a highly repetitive payload: 100 copies of a 400-character string."""
    repeated_chunk = "test" * 100
    return {"data": [repeated_chunk] * 100}
def test_compression_decompression_cycle(test_cases):
    """Every sample must survive a compress / decompress round trip unchanged."""
    for sample in test_cases:
        packed = JsonCompressor.compress_json(sample)

        # The compressed form is a non-empty string.
        assert isinstance(packed, str)
        assert len(packed) > 0

        restored = JsonCompressor.decompress_json(packed)
        assert sample == restored
def test_compression_reduces_size(large_data):
    """For a large repetitive payload the compressed string is shorter than its JSON text."""
    plain_length = len(json.dumps(large_data))
    packed_length = len(JsonCompressor.compress_json(large_data))
    assert packed_length < plain_length
def test_invalid_json_serializable():
    """Compressing a value that JSON cannot serialise raises TypeError."""
    not_serialisable = {1, 2, 3}  # sets have no JSON representation
    with pytest.raises(TypeError):
        JsonCompressor.compress_json(not_serialisable)
def test_invalid_compressed_string():
    """Decompressing a string that was not produced by the compressor raises."""
    bogus_payload = "invalid_base64_string"
    with pytest.raises(Exception):
        JsonCompressor.decompress_json(bogus_payload)
def test_empty_string_input():
    """Decompressing the empty string raises."""
    empty_payload = ""
    with pytest.raises(Exception):
        JsonCompressor.decompress_json(empty_payload)
def test_special_characters():
    """Punctuation and non-ASCII text survive a compress / decompress round trip."""
    sample = {
        "special": "!@#$%^&*()_+-=[]{}|;:,.<>?",
        "unicode": "Hello 世界 🌍",
    }
    restored = JsonCompressor.decompress_json(JsonCompressor.compress_json(sample))
    assert sample == restored
# One round-trip case per input shape.
@pytest.mark.parametrize(
    "test_input",
    [
        {"simple": "value"},
        [1, 2, 3],
        {"nested": {"key": "value"}},
        ["mixed", 1, True, None],
        {"unicode": "🌍"},
    ],
)
def test_various_input_types(test_input):
    """Each parametrized value must equal itself after compress / decompress."""
    packed = JsonCompressor.compress_json(test_input)
    restored = JsonCompressor.decompress_json(packed)
    assert test_input == restored
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment