Commit 6a481320 authored by icecraft
Browse files

fix: using new data api replace old rw api

parent 7b197fe2
import tempfile
import os import os
import shutil import shutil
import tempfile
from click.testing import CliRunner from click.testing import CliRunner
from magic_pdf.tools import cli_dev from magic_pdf.tools import cli_dev
...@@ -8,22 +9,22 @@ from magic_pdf.tools import cli_dev ...@@ -8,22 +9,22 @@ from magic_pdf.tools import cli_dev
def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev`` and check its output files.

    The command is given a sample PDF and its model JSON and writes into a
    fresh temporary directory. The test then checks that every expected
    artifact exists under ``<output>/cli_test_01/auto`` and is larger than a
    minimum size, and that an ``images`` directory was created.

    The asset paths are relative, so they are resolved against the current
    working directory of the test run.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'cli_test_01'
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # (artifact name, size in bytes the artifact must exceed)
    expected_min_sizes = [
        (f'{filename}_content_list.json', 5000),
        (f'{filename}.md', 7000),
        (f'{filename}_middle.json', 200000),
        (f'{filename}_model.json', 15000),
        (f'{filename}_origin.pdf', 400000),
        (f'{filename}_layout.pdf', 400000),
        (f'{filename}_spans.pdf', 400000),
    ]

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                'pdf',
                '-p',
                'tests/unittest/test_tools/assets/cli/pdf/cli_test_01.pdf',
                '-j',
                'tests/unittest/test_tools/assets/cli_dev/cli_test_01.model.json',
                '-o',
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0, result.output
        base_output_dir = os.path.join(temp_output_dir, filename, 'auto')

        for artifact, min_size in expected_min_sizes:
            # os.stat raises FileNotFoundError when the artifact is missing.
            size = os.stat(os.path.join(base_output_dir, artifact)).st_size
            assert size > min_size, f'{artifact}: {size} bytes <= {min_size}'

        # isdir() is False for a missing path, so it covers exists() too.
        assert os.path.isdir(os.path.join(base_output_dir, 'images'))
    finally:
        # teardown: runs even when a check fails, so no temp dir is leaked.
        shutil.rmtree(temp_output_dir)
...@@ -63,26 +63,26 @@ def test_cli_pdf(): ...@@ -63,26 +63,26 @@ def test_cli_pdf():
def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev`` and check its output files.

    ``cli_dev.read_s3_path`` is replaced for the duration of the test by a
    function that reads the given path from the local filesystem. The test
    then checks that every expected artifact exists under
    ``<output>/cli_test_01/auto`` and is larger than a minimum size, and that
    an ``images`` directory was created.

    The asset path is relative, so it is resolved against the current
    working directory of the test run.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/tools'
    filename = 'cli_test_01'
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    def mock_read_s3_path(s3path):
        with open(s3path, 'rb') as f:
            return f.read()

    # Remember the current attribute (or its absence) so the module-level
    # patch can be undone and does not leak into other tests.
    not_set = object()
    original_read_s3_path = getattr(cli_dev, 'read_s3_path', not_set)
    cli_dev.read_s3_path = mock_read_s3_path  # mock

    # (artifact name, size in bytes the artifact must exceed)
    expected_min_sizes = [
        (f'{filename}_content_list.json', 5000),
        (f'{filename}.md', 7000),
        (f'{filename}_middle.json', 200000),
        (f'{filename}_model.json', 15000),
        (f'{filename}_origin.pdf', 400000),
        (f'{filename}_layout.pdf', 400000),
        (f'{filename}_spans.pdf', 400000),
    ]

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                'jsonl',
                '-j',
                'tests/unittest/test_tools/assets/cli_dev/cli_test_01.jsonl',
                '-o',
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0, result.output
        base_output_dir = os.path.join(temp_output_dir, filename, 'auto')

        for artifact, min_size in expected_min_sizes:
            # os.stat raises FileNotFoundError when the artifact is missing.
            size = os.stat(os.path.join(base_output_dir, artifact)).st_size
            assert size > min_size, f'{artifact}: {size} bytes <= {min_size}'

        # isdir() is False for a missing path, so it covers exists() too.
        assert os.path.isdir(os.path.join(base_output_dir, 'images'))
    finally:
        # teardown: undo the mock and remove the temp dir even on failure.
        if original_read_s3_path is not_set:
            del cli_dev.read_s3_path
        else:
            cli_dev.read_s3_path = original_read_s3_path
        shutil.rmtree(temp_output_dir)
import tempfile
import os import os
import shutil import shutil
import tempfile
import pytest import pytest
from magic_pdf.tools.common import do_parse from magic_pdf.tools.common import do_parse
@pytest.mark.parametrize("method", ["auto", "txt", "ocr"]) @pytest.mark.parametrize('method', ['auto', 'txt', 'ocr'])
def test_common_do_parse(method): def test_common_do_parse(method):
import magic_pdf.model as model_config
model_config.__use_inside_model__ = True
# setup # setup
unitest_dir = "/tmp/magic_pdf/unittest/tools" unitest_dir = '/tmp/magic_pdf/unittest/tools'
filename = "fake" filename = 'fake'
os.makedirs(unitest_dir, exist_ok=True) os.makedirs(unitest_dir, exist_ok=True)
temp_output_dir = tempfile.mkdtemp(dir="/tmp/magic_pdf/unittest/tools") temp_output_dir = tempfile.mkdtemp(dir='/tmp/magic_pdf/unittest/tools')
# run # run
with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f: with open('tests/unittest/test_tools/assets/common/cli_test_01.pdf', 'rb') as f:
bits = f.read() bits = f.read()
do_parse(temp_output_dir, do_parse(temp_output_dir,
filename, filename,
...@@ -27,31 +29,31 @@ def test_common_do_parse(method): ...@@ -27,31 +29,31 @@ def test_common_do_parse(method):
f_dump_content_list=True) f_dump_content_list=True)
# check # check
base_output_dir = os.path.join(temp_output_dir, f"fake/{method}") base_output_dir = os.path.join(temp_output_dir, f'fake/{method}')
r = os.stat(os.path.join(base_output_dir, "content_list.json")) r = os.stat(os.path.join(base_output_dir, f'{filename}_content_list.json'))
assert r.st_size > 5000 assert r.st_size > 5000
r = os.stat(os.path.join(base_output_dir, f"{filename}.md")) r = os.stat(os.path.join(base_output_dir, f'{filename}.md'))
assert r.st_size > 7000 assert r.st_size > 7000
r = os.stat(os.path.join(base_output_dir, "middle.json")) r = os.stat(os.path.join(base_output_dir, f'{filename}_middle.json'))
assert r.st_size > 200000 assert r.st_size > 200000
r = os.stat(os.path.join(base_output_dir, "model.json")) r = os.stat(os.path.join(base_output_dir, f'{filename}_model.json'))
assert r.st_size > 15000 assert r.st_size > 15000
r = os.stat(os.path.join(base_output_dir, "origin.pdf")) r = os.stat(os.path.join(base_output_dir, f'{filename}_origin.pdf'))
assert r.st_size > 500000 assert r.st_size > 400000
r = os.stat(os.path.join(base_output_dir, "layout.pdf")) r = os.stat(os.path.join(base_output_dir, f'{filename}_layout.pdf'))
assert r.st_size > 500000 assert r.st_size > 400000
r = os.stat(os.path.join(base_output_dir, "spans.pdf")) r = os.stat(os.path.join(base_output_dir, f'{filename}_spans.pdf'))
assert r.st_size > 500000 assert r.st_size > 400000
os.path.exists(os.path.join(base_output_dir, "images")) os.path.exists(os.path.join(base_output_dir, 'images'))
os.path.isdir(os.path.join(base_output_dir, "images")) os.path.isdir(os.path.join(base_output_dir, 'images'))
# teardown # teardown
shutil.rmtree(temp_output_dir) shutil.rmtree(temp_output_dir)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment