"backends/vscode:/vscode.git/clone" did not exist on "136bcc812870e36aea69c3bb9cb8012f0a63d973"
Unverified Commit 40e0827e authored by icecraft's avatar icecraft Committed by GitHub
Browse files

Feat/impl cli (#264)



* feat: refactor cli command

* feat: add docs to describe the output files of cli

* feat: resolve review comments

* feat: update docs about middle.json

---------
Co-authored-by: shenguanlin <shenguanlin@pjlab.org.cn>
parent 5e38c4c8
{"file_location":"tests/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722
412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.5010986328125,1346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,3
84,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\operatorname{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\cdot}00\\;\\mathrm{a.m}"},{"category_id":13,"p
oly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20~\\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(l)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"^{1\\mathrm{~h~}}"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]}
\ No newline at end of file
[
{
"layout_dets": [
{
"category_id": 1,
"poly": [
882.4013061523438,
169.93817138671875,
1552.350341796875,
169.93817138671875,
1552.350341796875,
625.8263549804688,
882.4013061523438,
625.8263549804688
],
"score": 0.999992311000824
},
{
"category_id": 1,
"poly": [
882.474853515625,
1450.92822265625,
1551.4490966796875,
1450.92822265625,
1551.4490966796875,
1877.5712890625,
882.474853515625,
1877.5712890625
],
"score": 0.9999903440475464
},
{
"category_id": 1,
"poly": [
881.6513061523438,
626.2058715820312,
1552.1400146484375,
626.2058715820312,
1552.1400146484375,
1450.604736328125,
881.6513061523438,
1450.604736328125
],
"score": 0.9999856352806091
},
{
"category_id": 1,
"poly": [
149.41075134277344,
232.1595001220703,
819.0465087890625,
232.1595001220703,
819.0465087890625,
625.8865356445312,
149.41075134277344,
625.8865356445312
],
"score": 0.99998539686203
},
{
"category_id": 1,
"poly": [
149.3945770263672,
1215.5172119140625,
817.8850708007812,
1215.5172119140625,
817.8850708007812,
1304.873291015625,
149.3945770263672,
1304.873291015625
],
"score": 0.9999765157699585
},
{
"category_id": 1,
"poly": [
882.6979370117188,
1880.13916015625,
1552.15185546875,
1880.13916015625,
1552.15185546875,
2031.339599609375,
882.6979370117188,
2031.339599609375
],
"score": 0.9999744892120361
},
{
"category_id": 1,
"poly": [
148.96054077148438,
743.3055419921875,
818.6231689453125,
743.3055419921875,
818.6231689453125,
1074.2369384765625,
148.96054077148438,
1074.2369384765625
],
"score": 0.9999669790267944
},
{
"category_id": 1,
"poly": [
148.8435516357422,
1791.14306640625,
818.6885375976562,
1791.14306640625,
818.6885375976562,
2030.794189453125,
148.8435516357422,
2030.794189453125
],
"score": 0.9999618530273438
},
{
"category_id": 0,
"poly": [
150.7009735107422,
684.0087890625,
623.5106201171875,
684.0087890625,
623.5106201171875,
717.03662109375,
150.7009735107422,
717.03662109375
],
"score": 0.9999415278434753
},
{
"category_id": 8,
"poly": [
146.48068237304688,
1331.6737060546875,
317.2640075683594,
1331.6737060546875,
317.2640075683594,
1400.1722412109375,
146.48068237304688,
1400.1722412109375
],
"score": 0.9998958110809326
},
{
"category_id": 1,
"poly": [
149.42420959472656,
1430.8782958984375,
818.9042358398438,
1430.8782958984375,
818.9042358398438,
1672.7386474609375,
149.42420959472656,
1672.7386474609375
],
"score": 0.9998599290847778
},
{
"category_id": 1,
"poly": [
149.18746948242188,
172.10252380371094,
818.5662231445312,
172.10252380371094,
818.5662231445312,
230.4594268798828,
149.18746948242188,
230.4594268798828
],
"score": 0.9997718334197998
},
{
"category_id": 0,
"poly": [
149.0175018310547,
1732.1090087890625,
702.1005859375,
1732.1090087890625,
702.1005859375,
1763.6046142578125,
149.0175018310547,
1763.6046142578125
],
"score": 0.9997085928916931
},
{
"category_id": 2,
"poly": [
1519.802490234375,
98.59099578857422,
1551.985107421875,
98.59099578857422,
1551.985107421875,
119.48420715332031,
1519.802490234375,
119.48420715332031
],
"score": 0.9995552897453308
},
{
"category_id": 8,
"poly": [
146.9109649658203,
1100.156494140625,
544.2803344726562,
1100.156494140625,
544.2803344726562,
1184.929443359375,
146.9109649658203,
1184.929443359375
],
"score": 0.9995207786560059
},
{
"category_id": 2,
"poly": [
148.11611938476562,
99.87767791748047,
318.926025390625,
99.87767791748047,
318.926025390625,
120.70393371582031,
148.11611938476562,
120.70393371582031
],
"score": 0.999351441860199
},
{
"category_id": 9,
"poly": [
791.7642211914062,
1130.056396484375,
818.6940307617188,
1130.056396484375,
818.6940307617188,
1161.1080322265625,
791.7642211914062,
1161.1080322265625
],
"score": 0.9908884763717651
},
{
"category_id": 9,
"poly": [
788.37060546875,
1346.8450927734375,
818.5010986328125,
1346.8450927734375,
818.5010986328125,
1377.370361328125,
788.37060546875,
1377.370361328125
],
"score": 0.9873985052108765
},
{
"category_id": 14,
"poly": [
146,
1103,
543,
1103,
543,
1184,
146,
1184
],
"score": 0.94,
"latex": "E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"
},
{
"category_id": 13,
"poly": [
1196,
354,
1278,
354,
1278,
384,
1196,
384
],
"score": 0.91,
"latex": "p(1-q)"
},
{
"category_id": 13,
"poly": [
881,
415,
1020,
415,
1020,
444,
881,
444
],
"score": 0.91,
"latex": "(1-p)(1-q)"
},
{
"category_id": 14,
"poly": [
147,
1333,
318,
1333,
318,
1400,
147,
1400
],
"score": 0.91,
"latex": "\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"
},
{
"category_id": 13,
"poly": [
1197,
657,
1263,
657,
1263,
686,
1197,
686
],
"score": 0.9,
"latex": "(1-p)"
},
{
"category_id": 13,
"poly": [
213,
1217,
263,
1217,
263,
1244,
213,
1244
],
"score": 0.88,
"latex": "E[X]"
},
{
"category_id": 13,
"poly": [
214,
1434,
245,
1434,
245,
1459,
214,
1459
],
"score": 0.87,
"latex": "\\upsigma_{H}"
},
{
"category_id": 13,
"poly": [
324,
2002,
373,
2002,
373,
2028,
324,
2028
],
"score": 0.84,
"latex": "30\\%"
},
{
"category_id": 13,
"poly": [
1209,
693,
1225,
693,
1225,
717,
1209,
717
],
"score": 0.83,
"latex": "p"
},
{
"category_id": 13,
"poly": [
990,
449,
1007,
449,
1007,
474,
990,
474
],
"score": 0.81,
"latex": "p"
},
{
"category_id": 13,
"poly": [
346,
1277,
369,
1277,
369,
1301,
346,
1301
],
"score": 0.81,
"latex": "H"
},
{
"category_id": 13,
"poly": [
1137,
661,
1154,
661,
1154,
686,
1137,
686
],
"score": 0.81,
"latex": "p"
},
{
"category_id": 13,
"poly": [
522,
1432,
579,
1432,
579,
1459,
522,
1459
],
"score": 0.81,
"latex": "H\\left(4\\right)"
},
{
"category_id": 13,
"poly": [
944,
540,
962,
540,
962,
565,
944,
565
],
"score": 0.8,
"latex": "p"
},
{
"category_id": 13,
"poly": [
1444,
936,
1461,
936,
1461,
961,
1444,
961
],
"score": 0.79,
"latex": "p"
},
{
"category_id": 13,
"poly": [
602,
1247,
624,
1247,
624,
1270,
602,
1270
],
"score": 0.78,
"latex": "H"
},
{
"category_id": 13,
"poly": [
147,
1247,
167,
1247,
167,
1271,
147,
1271
],
"score": 0.77,
"latex": "X"
},
{
"category_id": 13,
"poly": [
210,
1246,
282,
1246,
282,
1274,
210,
1274
],
"score": 0.77,
"latex": "\\operatorname{CV}(H)"
},
{
"category_id": 13,
"poly": [
1346,
268,
1361,
268,
1361,
292,
1346,
292
],
"score": 0.76,
"latex": "q"
},
{
"category_id": 13,
"poly": [
215,
957,
238,
957,
238,
981,
215,
981
],
"score": 0.74,
"latex": "H"
},
{
"category_id": 13,
"poly": [
149,
956,
173,
956,
173,
981,
149,
981
],
"score": 0.63,
"latex": "W"
},
{
"category_id": 13,
"poly": [
924,
841,
1016,
841,
1016,
868,
924,
868
],
"score": 0.56,
"latex": "8{\\cdot}00\\;\\mathrm{a.m}"
},
{
"category_id": 13,
"poly": [
956,
871,
1032,
871,
1032,
898,
956,
898
],
"score": 0.43,
"latex": "20~\\mathrm{min}"
},
{
"category_id": 13,
"poly": [
1082,
781,
1112,
781,
1112,
808,
1082,
808
],
"score": 0.41,
"latex": "(l)"
},
{
"category_id": 13,
"poly": [
697,
1821,
734,
1821,
734,
1847,
697,
1847
],
"score": 0.3,
"latex": "^{1\\mathrm{~h~}}"
}
],
"page_info": {
"page_no": 0,
"height": 2200,
"width": 1700
}
}
]
\ No newline at end of file
import tempfile
import os
import shutil
from click.testing import CliRunner
from magic_pdf.tools.cli import cli
def test_cli_pdf():
    """Run the CLI on a single PDF and verify the files it writes.

    Invokes ``cli`` with ``-p <pdf> -o <tmpdir>`` and expects exit code 0.
    Under ``<tmpdir>/cli_test_01/auto`` every expected output file must be
    larger than its minimum size, an ``images`` directory must exist, and
    ``content_list.json`` must be absent.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # try/finally so the scratch directory is removed even when a check fails
    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli,
            [
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-o",
                temp_output_dir,
            ],
        )

        # check; include the captured output so a failure is diagnosable
        assert result.exit_code == 0, result.output

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
        # (file name, size in bytes the file must exceed)
        min_sizes = (
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in min_sizes:
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, f"{name}: {size} bytes, expected > {min_size}"

        # isdir() is False for a missing path, so it covers the exists() check
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
        assert not os.path.exists(
            os.path.join(base_output_dir, "content_list.json")
        )
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)
def test_cli_path():
    """Run the CLI on a directory of PDFs and verify each document's output.

    Invokes ``cli`` with ``-p <dir> -o <tmpdir>`` and expects exit code 0.
    For both ``cli_test_01`` and ``cli_test_02`` the ``<name>/auto`` folder
    must hold every expected output file above its minimum size and an
    ``images`` directory, and must not hold ``content_list.json``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # try/finally so the scratch directory is removed even when a check fails
    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli, ["-p", "tests/test_tools/assets/cli/path", "-o", temp_output_dir]
        )

        # check; include the captured output so a failure is diagnosable
        assert result.exit_code == 0, result.output

        # Only the markdown threshold differs between the two documents.
        md_min_sizes = (("cli_test_01", 7000), ("cli_test_02", 5000))
        for filename, md_min_size in md_min_sizes:
            base_output_dir = os.path.join(temp_output_dir, f"{filename}/auto")
            # (file name, size in bytes the file must exceed)
            min_sizes = (
                (f"{filename}.md", md_min_size),
                ("middle.json", 200000),
                ("model.json", 15000),
                ("origin.pdf", 500000),
                ("layout.pdf", 500000),
                ("spans.pdf", 500000),
            )
            for name, min_size in min_sizes:
                size = os.stat(os.path.join(base_output_dir, name)).st_size
                assert size > min_size, (
                    f"{filename}/{name}: {size} bytes, expected > {min_size}"
                )

            # isdir() is False for a missing path, so it covers exists() too
            assert os.path.isdir(os.path.join(base_output_dir, "images"))
            assert not os.path.exists(
                os.path.join(base_output_dir, "content_list.json")
            )
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)
import tempfile
import os
import shutil
from click.testing import CliRunner
from magic_pdf.tools import cli_dev
def test_cli_pdf():
    """Run the ``pdf`` sub-command of ``cli_dev`` and verify its output files.

    Invokes ``cli_dev.cli`` with ``pdf -p <pdf> -j <model json> -o <tmpdir>``
    and expects exit code 0. Under ``<tmpdir>/cli_test_01/auto`` every
    expected output file, including ``content_list.json``, must be larger
    than its minimum size and an ``images`` directory must exist.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # try/finally so the scratch directory is removed even when a check fails
    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "pdf",
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
                "-o",
                temp_output_dir,
            ],
        )

        # check; include the captured output so a failure is diagnosable
        assert result.exit_code == 0, result.output

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
        # (file name, size in bytes the file must exceed)
        min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in min_sizes:
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, f"{name}: {size} bytes, expected > {min_size}"

        # isdir() is False for a missing path, so it covers the exists() check
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)
def test_cli_jsonl():
    """Run the ``jsonl`` sub-command of ``cli_dev`` with a local-file reader.

    ``cli_dev.read_s3_path`` is temporarily replaced by a stub that reads the
    given path from the local filesystem. The test then invokes
    ``cli_dev.cli`` with ``jsonl -j <jsonl> -o <tmpdir>`` and expects exit
    code 0. Under ``<tmpdir>/cli_test_01/auto`` every expected output file
    must be larger than its minimum size and an ``images`` directory must
    exist.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    def mock_read_s3_path(s3path):
        # Treat the "s3 path" as a plain local path and return its raw bytes.
        with open(s3path, "rb") as f:
            return f.read()

    # Remember the current attribute so the stub does not leak into other
    # tests that run later in the same process.
    missing = object()
    original_read_s3_path = getattr(cli_dev, "read_s3_path", missing)
    cli_dev.read_s3_path = mock_read_s3_path  # mock

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "jsonl",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
                "-o",
                temp_output_dir,
            ],
        )

        # check; include the captured output so a failure is diagnosable
        assert result.exit_code == 0, result.output

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")
        # (file name, size in bytes the file must exceed)
        min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in min_sizes:
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, f"{name}: {size} bytes, expected > {min_size}"

        # isdir() is False for a missing path, so it covers the exists() check
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
    finally:
        # teardown: undo the monkeypatch, then drop the scratch directory
        if original_read_s3_path is missing:
            del cli_dev.read_s3_path
        else:
            cli_dev.read_s3_path = original_read_s3_path
        shutil.rmtree(temp_output_dir)
import tempfile
import os
import shutil
import pytest
from magic_pdf.tools.common import do_parse
@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
def test_common_do_parse(method):
    """Call ``do_parse`` directly for each parse method and verify its output.

    Reads the sample PDF as bytes and calls ``do_parse`` with the output
    name ``fake``, an empty list as the fourth argument and
    ``f_dump_content_list=True``. Under ``<tmpdir>/fake/<method>`` every
    expected output file must be larger than its minimum size and an
    ``images`` directory must exist.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "fake"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    # try/finally so the scratch directory is removed even when a check fails
    try:
        # run
        with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
            bits = f.read()
        # NOTE(review): the empty list is presumably the model output, to be
        # computed by do_parse itself -- confirm against do_parse's signature.
        do_parse(
            temp_output_dir, filename, bits, [], method, f_dump_content_list=True
        )

        # check
        base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")
        # (file name, size in bytes the file must exceed)
        min_sizes = (
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        )
        for name, min_size in min_sizes:
            size = os.stat(os.path.join(base_output_dir, name)).st_size
            assert size > min_size, f"{name}: {size} bytes, expected > {min_size}"

        # The original called exists()/isdir() without asserting the result,
        # so the images directory was never actually checked.
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
    finally:
        # teardown
        shutil.rmtree(temp_output_dir)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment