Unverified commit 8442ed39, authored by Xiaomeng Zhao and committed by GitHub
Browse files

Merge pull request #1006 from icecraft/fix/data_compatiable

fix: using new data api replace old rw api
parents bf7d2c4f 6a481320
import tempfile
import os
import shutil
import tempfile
from click.testing import CliRunner
from magic_pdf.tools import cli_dev
......@@ -8,22 +9,22 @@ from magic_pdf.tools import cli_dev
def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev`` and verify its outputs.

    The command is invoked in-process through click's ``CliRunner`` with the
    ``cli_test_01`` PDF and its pre-computed model JSON. The asset paths are
    relative, so the test has to be started from a working directory where
    ``tests/unittest/...`` resolves.

    The test passes when the command exits with code 0 and every expected
    artifact under ``<output>/cli_test_01/auto`` exists and exceeds a minimum
    size. Artifacts are named ``<filename>_<kind>``.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'cli_test_01'
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                'pdf',
                '-p',
                'tests/unittest/test_tools/assets/cli/pdf/cli_test_01.pdf',
                '-j',
                'tests/unittest/test_tools/assets/cli_dev/cli_test_01.model.json',
                '-o',
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')

        # Lower bounds (in bytes) for each generated artifact.
        min_sizes = {
            f'{filename}_content_list.json': 5000,
            f'{filename}.md': 7000,
            f'{filename}_middle.json': 200000,
            f'{filename}_model.json': 15000,
            f'{filename}_origin.pdf': 400000,
            f'{filename}_layout.pdf': 400000,
            f'{filename}_spans.pdf': 400000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the artifact is missing,
            # which fails the test just like a too-small file does.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, (
                f'{name} is {r.st_size} bytes, expected more than {min_size}'
            )

        assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
        assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
    finally:
        # teardown: runs even when an assertion above fails, so the
        # temporary output directory is never left behind.
        shutil.rmtree(temp_output_dir)
......@@ -63,26 +63,26 @@ def test_cli_pdf():
def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev`` and verify its outputs.

    ``cli_dev.read_s3_path`` is replaced for the duration of the test with a
    stub that reads the given path from the local filesystem, so no remote
    storage is contacted. The previous attribute is put back in teardown so
    the stub does not leak into other tests running in the same process.

    The test passes when the command exits with code 0 and every expected
    artifact under ``<output>/cli_test_01/auto`` exists and exceeds a minimum
    size. Artifacts are named ``<filename>_<kind>``.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'cli_test_01'
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    def mock_read_s3_path(s3path):
        # Treat the "s3 path" as a plain local file and return its bytes.
        with open(s3path, 'rb') as f:
            return f.read()

    # Remember what was there before patching; the sentinel distinguishes
    # "attribute absent" from any real value.
    missing = object()
    original_reader = getattr(cli_dev, 'read_s3_path', missing)
    cli_dev.read_s3_path = mock_read_s3_path  # mock

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                'jsonl',
                '-j',
                'tests/unittest/test_tools/assets/cli_dev/cli_test_01.jsonl',
                '-o',
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, 'cli_test_01/auto')

        # Lower bounds (in bytes) for each generated artifact.
        min_sizes = {
            f'{filename}_content_list.json': 5000,
            f'{filename}.md': 7000,
            f'{filename}_middle.json': 200000,
            f'{filename}_model.json': 15000,
            f'{filename}_origin.pdf': 400000,
            f'{filename}_layout.pdf': 400000,
            f'{filename}_spans.pdf': 400000,
        }
        for name, min_size in min_sizes.items():
            # os.stat raises FileNotFoundError when the artifact is missing,
            # which fails the test just like a too-small file does.
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, (
                f'{name} is {r.st_size} bytes, expected more than {min_size}'
            )

        assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
        assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True
    finally:
        # teardown: undo the module-level patch and remove the temporary
        # output directory even when an assertion above fails.
        if original_reader is missing:
            del cli_dev.read_s3_path
        else:
            cli_dev.read_s3_path = original_reader
        shutil.rmtree(temp_output_dir)
import tempfile
import os
import shutil
import tempfile
import pytest
from magic_pdf.tools.common import do_parse
@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
@pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
def test_common_do_parse(method):
import magic_pdf.model as model_config
model_config.__use_inside_model__ = True
# setup
unitest_dir = "/tmp/magic_pdf/unittest/tools"
filename = "fake"
unitest_dir = '/tmp/magic_pdf/unittest/tools'
filename = 'fake'
os.makedirs(unitest_dir, exist_ok=True)
temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools")
temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')
# run
with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
with open('tests/unittest/test_tools/assets/common/cli_test_01.pdf', 'rb') as f:
bits = f.read()
do_parse(temp_output_dir,
filename,
......@@ -27,31 +29,31 @@ def test_common_do_parse(method):
f_dump_content_list=True)
# check
# Outputs are written below <output>/fake/<method>; each artifact is named
# after the input file and must exceed a minimum size in bytes.
base_output_dir = os.path.join(temp_output_dir, f'fake/{method}')

r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
assert r.st_size > 5000

r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
assert r.st_size > 7000

r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
assert r.st_size > 200000

r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
assert r.st_size > 15000

r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
assert r.st_size > 400000

r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
assert r.st_size > 400000

r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
assert r.st_size > 400000

# These results must be asserted; a bare call evaluates the check and throws
# the answer away, so a missing images directory would go unnoticed.
assert os.path.exists(os.path.join(base_output_dir, 'images')) is True
assert os.path.isdir(os.path.join(base_output_dir, 'images')) is True

# teardown
shutil.rmtree(temp_output_dir)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment