Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
a697002c
Commit
a697002c
authored
Aug 01, 2024
by
myhloli
Browse files
Merge remote-tracking branch 'origin/master'
parents
b4b2a099
40e0827e
Changes
29
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
936 additions
and
0 deletions
+936
-0
tests/test_tools/assets/cli/path/cli_test_02.pdf
tests/test_tools/assets/cli/path/cli_test_02.pdf
+0
-0
tests/test_tools/assets/cli/pdf/cli_test_01.pdf
tests/test_tools/assets/cli/pdf/cli_test_01.pdf
+0
-0
tests/test_tools/assets/cli_dev/cli_test_01.jsonl
tests/test_tools/assets/cli_dev/cli_test_01.jsonl
+1
-0
tests/test_tools/assets/cli_dev/cli_test_01.model.json
tests/test_tools/assets/cli_dev/cli_test_01.model.json
+638
-0
tests/test_tools/assets/cli_dev/cli_test_01.pdf
tests/test_tools/assets/cli_dev/cli_test_01.pdf
+0
-0
tests/test_tools/assets/common/cli_test_01.pdf
tests/test_tools/assets/common/cli_test_01.pdf
+0
-0
tests/test_tools/test_cli.py
tests/test_tools/test_cli.py
+125
-0
tests/test_tools/test_cli_dev.py
tests/test_tools/test_cli_dev.py
+120
-0
tests/test_tools/test_common.py
tests/test_tools/test_common.py
+52
-0
No files found.
tests/test_tools/assets/cli/path/cli_test_02.pdf
0 → 100644
View file @
a697002c
File added
tests/test_tools/assets/cli/pdf/cli_test_01.pdf
0 → 100644
View file @
a697002c
File added
tests/test_tools/assets/cli_dev/cli_test_01.jsonl
0 → 100644
View file @
a697002c
{"file_location":"tests/test_tools/assets/cli_dev/cli_test_01.pdf","doc_layout_result":[{"layout_dets":[{"category_id":1,"poly":[882.4013061523438,169.93817138671875,1552.350341796875,169.93817138671875,1552.350341796875,625.8263549804688,882.4013061523438,625.8263549804688],"score":0.999992311000824},{"category_id":1,"poly":[882.474853515625,1450.92822265625,1551.4490966796875,1450.92822265625,1551.4490966796875,1877.5712890625,882.474853515625,1877.5712890625],"score":0.9999903440475464},{"category_id":1,"poly":[881.6513061523438,626.2058715820312,1552.1400146484375,626.2058715820312,1552.1400146484375,1450.604736328125,881.6513061523438,1450.604736328125],"score":0.9999856352806091},{"category_id":1,"poly":[149.41075134277344,232.1595001220703,819.0465087890625,232.1595001220703,819.0465087890625,625.8865356445312,149.41075134277344,625.8865356445312],"score":0.99998539686203},{"category_id":1,"poly":[149.3945770263672,1215.5172119140625,817.8850708007812,1215.5172119140625,817.8850708007812,1304.873291015625,149.3945770263672,1304.873291015625],"score":0.9999765157699585},{"category_id":1,"poly":[882.6979370117188,1880.13916015625,1552.15185546875,1880.13916015625,1552.15185546875,2031.339599609375,882.6979370117188,2031.339599609375],"score":0.9999744892120361},{"category_id":1,"poly":[148.96054077148438,743.3055419921875,818.6231689453125,743.3055419921875,818.6231689453125,1074.2369384765625,148.96054077148438,1074.2369384765625],"score":0.9999669790267944},{"category_id":1,"poly":[148.8435516357422,1791.14306640625,818.6885375976562,1791.14306640625,818.6885375976562,2030.794189453125,148.8435516357422,2030.794189453125],"score":0.9999618530273438},{"category_id":0,"poly":[150.7009735107422,684.0087890625,623.5106201171875,684.0087890625,623.5106201171875,717.03662109375,150.7009735107422,717.03662109375],"score":0.9999415278434753},{"category_id":8,"poly":[146.48068237304688,1331.6737060546875,317.2640075683594,1331.6737060546875,317.2640075683594,1400.1722
412109375,146.48068237304688,1400.1722412109375],"score":0.9998958110809326},{"category_id":1,"poly":[149.42420959472656,1430.8782958984375,818.9042358398438,1430.8782958984375,818.9042358398438,1672.7386474609375,149.42420959472656,1672.7386474609375],"score":0.9998599290847778},{"category_id":1,"poly":[149.18746948242188,172.10252380371094,818.5662231445312,172.10252380371094,818.5662231445312,230.4594268798828,149.18746948242188,230.4594268798828],"score":0.9997718334197998},{"category_id":0,"poly":[149.0175018310547,1732.1090087890625,702.1005859375,1732.1090087890625,702.1005859375,1763.6046142578125,149.0175018310547,1763.6046142578125],"score":0.9997085928916931},{"category_id":2,"poly":[1519.802490234375,98.59099578857422,1551.985107421875,98.59099578857422,1551.985107421875,119.48420715332031,1519.802490234375,119.48420715332031],"score":0.9995552897453308},{"category_id":8,"poly":[146.9109649658203,1100.156494140625,544.2803344726562,1100.156494140625,544.2803344726562,1184.929443359375,146.9109649658203,1184.929443359375],"score":0.9995207786560059},{"category_id":2,"poly":[148.11611938476562,99.87767791748047,318.926025390625,99.87767791748047,318.926025390625,120.70393371582031,148.11611938476562,120.70393371582031],"score":0.999351441860199},{"category_id":9,"poly":[791.7642211914062,1130.056396484375,818.6940307617188,1130.056396484375,818.6940307617188,1161.1080322265625,791.7642211914062,1161.1080322265625],"score":0.9908884763717651},{"category_id":9,"poly":[788.37060546875,1346.8450927734375,818.5010986328125,1346.8450927734375,818.5010986328125,1377.370361328125,788.37060546875,1377.370361328125],"score":0.9873985052108765},{"category_id":14,"poly":[146,1103,543,1103,543,1184,146,1184],"score":0.94,"latex":"E\\!\\left(W\\right)\\!=\\!\\frac{E\\!\\left[H^{2}\\right]}{2E\\!\\left[H\\right]}\\!=\\!\\frac{E\\!\\left[H\\right]}{2}\\!\\!\\left(1\\!+\\!\\operatorname{CV}\\!\\left(H\\right)^{2}\\right)"},{"category_id":13,"poly":[1196,354,1278,354,1278,3
84,1196,384],"score":0.91,"latex":"p(1-q)"},{"category_id":13,"poly":[881,415,1020,415,1020,444,881,444],"score":0.91,"latex":"(1-p)(1-q)"},{"category_id":14,"poly":[147,1333,318,1333,318,1400,147,1400],"score":0.91,"latex":"\\mathbf{CV}\\big(H\\big)\\!=\\!\\frac{\\boldsymbol{\\upsigma}_{H}}{E\\big[H\\big]}"},{"category_id":13,"poly":[1197,657,1263,657,1263,686,1197,686],"score":0.9,"latex":"(1-p)"},{"category_id":13,"poly":[213,1217,263,1217,263,1244,213,1244],"score":0.88,"latex":"E[X]"},{"category_id":13,"poly":[214,1434,245,1434,245,1459,214,1459],"score":0.87,"latex":"\\upsigma_{H}"},{"category_id":13,"poly":[324,2002,373,2002,373,2028,324,2028],"score":0.84,"latex":"30\\%"},{"category_id":13,"poly":[1209,693,1225,693,1225,717,1209,717],"score":0.83,"latex":"p"},{"category_id":13,"poly":[990,449,1007,449,1007,474,990,474],"score":0.81,"latex":"p"},{"category_id":13,"poly":[346,1277,369,1277,369,1301,346,1301],"score":0.81,"latex":"H"},{"category_id":13,"poly":[1137,661,1154,661,1154,686,1137,686],"score":0.81,"latex":"p"},{"category_id":13,"poly":[522,1432,579,1432,579,1459,522,1459],"score":0.81,"latex":"H\\left(4\\right)"},{"category_id":13,"poly":[944,540,962,540,962,565,944,565],"score":0.8,"latex":"p"},{"category_id":13,"poly":[1444,936,1461,936,1461,961,1444,961],"score":0.79,"latex":"p"},{"category_id":13,"poly":[602,1247,624,1247,624,1270,602,1270],"score":0.78,"latex":"H"},{"category_id":13,"poly":[147,1247,167,1247,167,1271,147,1271],"score":0.77,"latex":"X"},{"category_id":13,"poly":[210,1246,282,1246,282,1274,210,1274],"score":0.77,"latex":"\\operatorname{CV}(H)"},{"category_id":13,"poly":[1346,268,1361,268,1361,292,1346,292],"score":0.76,"latex":"q"},{"category_id":13,"poly":[215,957,238,957,238,981,215,981],"score":0.74,"latex":"H"},{"category_id":13,"poly":[149,956,173,956,173,981,149,981],"score":0.63,"latex":"W"},{"category_id":13,"poly":[924,841,1016,841,1016,868,924,868],"score":0.56,"latex":"8{\\cdot}00\\;\\mathrm{a.m}"},{"category_id":13,"p
oly":[956,871,1032,871,1032,898,956,898],"score":0.43,"latex":"20~\\mathrm{min}"},{"category_id":13,"poly":[1082,781,1112,781,1112,808,1082,808],"score":0.41,"latex":"(l)"},{"category_id":13,"poly":[697,1821,734,1821,734,1847,697,1847],"score":0.3,"latex":"^{1\\mathrm{~h~}}"}],"page_info":{"page_no":0,"height":2200,"width":1700}}]}
\ No newline at end of file
tests/test_tools/assets/cli_dev/cli_test_01.model.json
0 → 100644
View file @
a697002c
[
{
"layout_dets"
:
[
{
"category_id"
:
1
,
"poly"
:
[
882.4013061523438
,
169.93817138671875
,
1552.350341796875
,
169.93817138671875
,
1552.350341796875
,
625.8263549804688
,
882.4013061523438
,
625.8263549804688
],
"score"
:
0.999992311000824
},
{
"category_id"
:
1
,
"poly"
:
[
882.474853515625
,
1450.92822265625
,
1551.4490966796875
,
1450.92822265625
,
1551.4490966796875
,
1877.5712890625
,
882.474853515625
,
1877.5712890625
],
"score"
:
0.9999903440475464
},
{
"category_id"
:
1
,
"poly"
:
[
881.6513061523438
,
626.2058715820312
,
1552.1400146484375
,
626.2058715820312
,
1552.1400146484375
,
1450.604736328125
,
881.6513061523438
,
1450.604736328125
],
"score"
:
0.9999856352806091
},
{
"category_id"
:
1
,
"poly"
:
[
149.41075134277344
,
232.1595001220703
,
819.0465087890625
,
232.1595001220703
,
819.0465087890625
,
625.8865356445312
,
149.41075134277344
,
625.8865356445312
],
"score"
:
0.99998539686203
},
{
"category_id"
:
1
,
"poly"
:
[
149.3945770263672
,
1215.5172119140625
,
817.8850708007812
,
1215.5172119140625
,
817.8850708007812
,
1304.873291015625
,
149.3945770263672
,
1304.873291015625
],
"score"
:
0.9999765157699585
},
{
"category_id"
:
1
,
"poly"
:
[
882.6979370117188
,
1880.13916015625
,
1552.15185546875
,
1880.13916015625
,
1552.15185546875
,
2031.339599609375
,
882.6979370117188
,
2031.339599609375
],
"score"
:
0.9999744892120361
},
{
"category_id"
:
1
,
"poly"
:
[
148.96054077148438
,
743.3055419921875
,
818.6231689453125
,
743.3055419921875
,
818.6231689453125
,
1074.2369384765625
,
148.96054077148438
,
1074.2369384765625
],
"score"
:
0.9999669790267944
},
{
"category_id"
:
1
,
"poly"
:
[
148.8435516357422
,
1791.14306640625
,
818.6885375976562
,
1791.14306640625
,
818.6885375976562
,
2030.794189453125
,
148.8435516357422
,
2030.794189453125
],
"score"
:
0.9999618530273438
},
{
"category_id"
:
0
,
"poly"
:
[
150.7009735107422
,
684.0087890625
,
623.5106201171875
,
684.0087890625
,
623.5106201171875
,
717.03662109375
,
150.7009735107422
,
717.03662109375
],
"score"
:
0.9999415278434753
},
{
"category_id"
:
8
,
"poly"
:
[
146.48068237304688
,
1331.6737060546875
,
317.2640075683594
,
1331.6737060546875
,
317.2640075683594
,
1400.1722412109375
,
146.48068237304688
,
1400.1722412109375
],
"score"
:
0.9998958110809326
},
{
"category_id"
:
1
,
"poly"
:
[
149.42420959472656
,
1430.8782958984375
,
818.9042358398438
,
1430.8782958984375
,
818.9042358398438
,
1672.7386474609375
,
149.42420959472656
,
1672.7386474609375
],
"score"
:
0.9998599290847778
},
{
"category_id"
:
1
,
"poly"
:
[
149.18746948242188
,
172.10252380371094
,
818.5662231445312
,
172.10252380371094
,
818.5662231445312
,
230.4594268798828
,
149.18746948242188
,
230.4594268798828
],
"score"
:
0.9997718334197998
},
{
"category_id"
:
0
,
"poly"
:
[
149.0175018310547
,
1732.1090087890625
,
702.1005859375
,
1732.1090087890625
,
702.1005859375
,
1763.6046142578125
,
149.0175018310547
,
1763.6046142578125
],
"score"
:
0.9997085928916931
},
{
"category_id"
:
2
,
"poly"
:
[
1519.802490234375
,
98.59099578857422
,
1551.985107421875
,
98.59099578857422
,
1551.985107421875
,
119.48420715332031
,
1519.802490234375
,
119.48420715332031
],
"score"
:
0.9995552897453308
},
{
"category_id"
:
8
,
"poly"
:
[
146.9109649658203
,
1100.156494140625
,
544.2803344726562
,
1100.156494140625
,
544.2803344726562
,
1184.929443359375
,
146.9109649658203
,
1184.929443359375
],
"score"
:
0.9995207786560059
},
{
"category_id"
:
2
,
"poly"
:
[
148.11611938476562
,
99.87767791748047
,
318.926025390625
,
99.87767791748047
,
318.926025390625
,
120.70393371582031
,
148.11611938476562
,
120.70393371582031
],
"score"
:
0.999351441860199
},
{
"category_id"
:
9
,
"poly"
:
[
791.7642211914062
,
1130.056396484375
,
818.6940307617188
,
1130.056396484375
,
818.6940307617188
,
1161.1080322265625
,
791.7642211914062
,
1161.1080322265625
],
"score"
:
0.9908884763717651
},
{
"category_id"
:
9
,
"poly"
:
[
788.37060546875
,
1346.8450927734375
,
818.5010986328125
,
1346.8450927734375
,
818.5010986328125
,
1377.370361328125
,
788.37060546875
,
1377.370361328125
],
"score"
:
0.9873985052108765
},
{
"category_id"
:
14
,
"poly"
:
[
146
,
1103
,
543
,
1103
,
543
,
1184
,
146
,
1184
],
"score"
:
0.94
,
"latex"
:
"E
\\
!
\\
left(W
\\
right)
\\
!=
\\
!
\\
frac{E
\\
!
\\
left[H^{2}
\\
right]}{2E
\\
!
\\
left[H
\\
right]}
\\
!=
\\
!
\\
frac{E
\\
!
\\
left[H
\\
right]}{2}
\\
!
\\
!
\\
left(1
\\
!+
\\
!
\\
operatorname{CV}
\\
!
\\
left(H
\\
right)^{2}
\\
right)"
},
{
"category_id"
:
13
,
"poly"
:
[
1196
,
354
,
1278
,
354
,
1278
,
384
,
1196
,
384
],
"score"
:
0.91
,
"latex"
:
"p(1-q)"
},
{
"category_id"
:
13
,
"poly"
:
[
881
,
415
,
1020
,
415
,
1020
,
444
,
881
,
444
],
"score"
:
0.91
,
"latex"
:
"(1-p)(1-q)"
},
{
"category_id"
:
14
,
"poly"
:
[
147
,
1333
,
318
,
1333
,
318
,
1400
,
147
,
1400
],
"score"
:
0.91
,
"latex"
:
"
\\
mathbf{CV}
\\
big(H
\\
big)
\\
!=
\\
!
\\
frac{
\\
boldsymbol{
\\
upsigma}_{H}}{E
\\
big[H
\\
big]}"
},
{
"category_id"
:
13
,
"poly"
:
[
1197
,
657
,
1263
,
657
,
1263
,
686
,
1197
,
686
],
"score"
:
0.9
,
"latex"
:
"(1-p)"
},
{
"category_id"
:
13
,
"poly"
:
[
213
,
1217
,
263
,
1217
,
263
,
1244
,
213
,
1244
],
"score"
:
0.88
,
"latex"
:
"E[X]"
},
{
"category_id"
:
13
,
"poly"
:
[
214
,
1434
,
245
,
1434
,
245
,
1459
,
214
,
1459
],
"score"
:
0.87
,
"latex"
:
"
\\
upsigma_{H}"
},
{
"category_id"
:
13
,
"poly"
:
[
324
,
2002
,
373
,
2002
,
373
,
2028
,
324
,
2028
],
"score"
:
0.84
,
"latex"
:
"30
\\
%"
},
{
"category_id"
:
13
,
"poly"
:
[
1209
,
693
,
1225
,
693
,
1225
,
717
,
1209
,
717
],
"score"
:
0.83
,
"latex"
:
"p"
},
{
"category_id"
:
13
,
"poly"
:
[
990
,
449
,
1007
,
449
,
1007
,
474
,
990
,
474
],
"score"
:
0.81
,
"latex"
:
"p"
},
{
"category_id"
:
13
,
"poly"
:
[
346
,
1277
,
369
,
1277
,
369
,
1301
,
346
,
1301
],
"score"
:
0.81
,
"latex"
:
"H"
},
{
"category_id"
:
13
,
"poly"
:
[
1137
,
661
,
1154
,
661
,
1154
,
686
,
1137
,
686
],
"score"
:
0.81
,
"latex"
:
"p"
},
{
"category_id"
:
13
,
"poly"
:
[
522
,
1432
,
579
,
1432
,
579
,
1459
,
522
,
1459
],
"score"
:
0.81
,
"latex"
:
"H
\\
left(4
\\
right)"
},
{
"category_id"
:
13
,
"poly"
:
[
944
,
540
,
962
,
540
,
962
,
565
,
944
,
565
],
"score"
:
0.8
,
"latex"
:
"p"
},
{
"category_id"
:
13
,
"poly"
:
[
1444
,
936
,
1461
,
936
,
1461
,
961
,
1444
,
961
],
"score"
:
0.79
,
"latex"
:
"p"
},
{
"category_id"
:
13
,
"poly"
:
[
602
,
1247
,
624
,
1247
,
624
,
1270
,
602
,
1270
],
"score"
:
0.78
,
"latex"
:
"H"
},
{
"category_id"
:
13
,
"poly"
:
[
147
,
1247
,
167
,
1247
,
167
,
1271
,
147
,
1271
],
"score"
:
0.77
,
"latex"
:
"X"
},
{
"category_id"
:
13
,
"poly"
:
[
210
,
1246
,
282
,
1246
,
282
,
1274
,
210
,
1274
],
"score"
:
0.77
,
"latex"
:
"
\\
operatorname{CV}(H)"
},
{
"category_id"
:
13
,
"poly"
:
[
1346
,
268
,
1361
,
268
,
1361
,
292
,
1346
,
292
],
"score"
:
0.76
,
"latex"
:
"q"
},
{
"category_id"
:
13
,
"poly"
:
[
215
,
957
,
238
,
957
,
238
,
981
,
215
,
981
],
"score"
:
0.74
,
"latex"
:
"H"
},
{
"category_id"
:
13
,
"poly"
:
[
149
,
956
,
173
,
956
,
173
,
981
,
149
,
981
],
"score"
:
0.63
,
"latex"
:
"W"
},
{
"category_id"
:
13
,
"poly"
:
[
924
,
841
,
1016
,
841
,
1016
,
868
,
924
,
868
],
"score"
:
0.56
,
"latex"
:
"8{
\\
cdot}00
\\
;
\\
mathrm{a.m}"
},
{
"category_id"
:
13
,
"poly"
:
[
956
,
871
,
1032
,
871
,
1032
,
898
,
956
,
898
],
"score"
:
0.43
,
"latex"
:
"20~
\\
mathrm{min}"
},
{
"category_id"
:
13
,
"poly"
:
[
1082
,
781
,
1112
,
781
,
1112
,
808
,
1082
,
808
],
"score"
:
0.41
,
"latex"
:
"(l)"
},
{
"category_id"
:
13
,
"poly"
:
[
697
,
1821
,
734
,
1821
,
734
,
1847
,
697
,
1847
],
"score"
:
0.3
,
"latex"
:
"^{1
\\
mathrm{~h~}}"
}
],
"page_info"
:
{
"page_no"
:
0
,
"height"
:
2200
,
"width"
:
1700
}
}
]
\ No newline at end of file
tests/test_tools/assets/cli_dev/cli_test_01.pdf
0 → 100644
View file @
a697002c
File added
tests/test_tools/assets/common/cli_test_01.pdf
0 → 100644
View file @
a697002c
File added
tests/test_tools/test_cli.py
0 → 100644
View file @
a697002c
import
tempfile
import
os
import
shutil
from
click.testing
import
CliRunner
from
magic_pdf.tools.cli
import
cli
def test_cli_pdf():
    """Run ``cli`` on a single PDF and verify the files written to the output dir.

    The command is invoked with ``-p`` pointing at the ``cli_test_01.pdf``
    asset and ``-o`` pointing at a fresh temporary directory.  The test then
    checks that each expected artifact under ``cli_test_01/auto`` exceeds a
    minimum size, that an ``images`` directory exists, and that no
    ``content_list.json`` was produced.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli,
            [
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")

        # (file name, size in bytes the file must exceed)
        min_sizes = [
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        ]
        for name, min_size in min_sizes:
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        # isdir() is False for a missing path, so it also covers existence.
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
        assert not os.path.exists(
            os.path.join(base_output_dir, "content_list.json")
        )
    finally:
        # teardown: always remove the temp dir, even when a check above fails
        shutil.rmtree(temp_output_dir)
def test_cli_path():
    """Run ``cli`` on a directory and verify the output for both documents.

    The command is invoked with ``-p`` pointing at the ``cli/path`` asset
    directory and ``-o`` pointing at a fresh temporary directory.  For each of
    ``cli_test_01`` and ``cli_test_02`` the test checks, under
    ``<name>/auto``, that every expected artifact exceeds a minimum size, that
    an ``images`` directory exists, and that no ``content_list.json`` was
    produced.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli,
            ["-p", "tests/test_tools/assets/cli/path", "-o", temp_output_dir],
        )

        # check
        assert result.exit_code == 0

        # (document name, size in bytes its markdown file must exceed);
        # only the markdown threshold differs between the two documents.
        documents = [
            ("cli_test_01", 7000),
            ("cli_test_02", 5000),
        ]
        for filename, md_min_size in documents:
            base_output_dir = os.path.join(temp_output_dir, f"{filename}/auto")

            # (file name, size in bytes the file must exceed)
            min_sizes = [
                (f"{filename}.md", md_min_size),
                ("middle.json", 200000),
                ("model.json", 15000),
                ("origin.pdf", 500000),
                ("layout.pdf", 500000),
                ("spans.pdf", 500000),
            ]
            for name, min_size in min_sizes:
                r = os.stat(os.path.join(base_output_dir, name))
                assert r.st_size > min_size, f"{filename}: {name}"

            # isdir() is False for a missing path, so it also covers existence.
            assert os.path.isdir(os.path.join(base_output_dir, "images"))
            assert not os.path.exists(
                os.path.join(base_output_dir, "content_list.json")
            )
    finally:
        # teardown: always remove the temp dir, even when a check above fails
        shutil.rmtree(temp_output_dir)
tests/test_tools/test_cli_dev.py
0 → 100644
View file @
a697002c
import
tempfile
import
os
import
shutil
from
click.testing
import
CliRunner
from
magic_pdf.tools
import
cli_dev
def test_cli_pdf():
    """Run ``cli_dev.cli pdf`` with a model JSON and verify the output files.

    The ``pdf`` sub-command is invoked with ``-p`` (the PDF asset), ``-j``
    (the ``cli_test_01.model.json`` asset) and ``-o`` (a fresh temporary
    directory).  The test then checks that each expected artifact under
    ``cli_test_01/auto``, including ``content_list.json``, exceeds a minimum
    size and that an ``images`` directory exists.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "pdf",
                "-p",
                "tests/test_tools/assets/cli/pdf/cli_test_01.pdf",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.model.json",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")

        # (file name, size in bytes the file must exceed)
        min_sizes = [
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        ]
        for name, min_size in min_sizes:
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        # isdir() is False for a missing path, so it also covers existence.
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
    finally:
        # teardown: always remove the temp dir, even when a check above fails
        shutil.rmtree(temp_output_dir)
def test_cli_jsonl():
    """Run ``cli_dev.cli jsonl`` on a jsonl asset and verify the output files.

    ``cli_dev.read_s3_path`` is temporarily replaced by a function that reads
    the given path from the local filesystem (presumably so the
    ``file_location`` in the jsonl is served from disk instead of S3 --
    confirm against ``cli_dev``).  The original attribute is restored
    afterwards so the replacement does not leak into other tests.

    The ``jsonl`` sub-command is invoked with ``-j`` (the jsonl asset) and
    ``-o`` (a fresh temporary directory).  The test then checks that each
    expected artifact under ``cli_test_01/auto`` exceeds a minimum size and
    that an ``images`` directory exists.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "cli_test_01"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    def mock_read_s3_path(s3path):
        # Treat the "s3 path" as a local file path and return its raw bytes.
        with open(s3path, "rb") as f:
            return f.read()

    # mock: remember the real function so it can be put back in ``finally``
    original_read_s3_path = cli_dev.read_s3_path
    cli_dev.read_s3_path = mock_read_s3_path

    try:
        # run
        runner = CliRunner()
        result = runner.invoke(
            cli_dev.cli,
            [
                "jsonl",
                "-j",
                "tests/test_tools/assets/cli_dev/cli_test_01.jsonl",
                "-o",
                temp_output_dir,
            ],
        )

        # check
        assert result.exit_code == 0

        base_output_dir = os.path.join(temp_output_dir, "cli_test_01/auto")

        # (file name, size in bytes the file must exceed)
        min_sizes = [
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        ]
        for name, min_size in min_sizes:
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        # isdir() is False for a missing path, so it also covers existence.
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
    finally:
        # teardown: undo the monkeypatch and always remove the temp dir
        cli_dev.read_s3_path = original_read_s3_path
        shutil.rmtree(temp_output_dir)
tests/test_tools/test_common.py
0 → 100644
View file @
a697002c
import
tempfile
import
os
import
shutil
import
pytest
from
magic_pdf.tools.common
import
do_parse
@pytest.mark.parametrize("method", ["auto", "txt", "ocr"])
def test_common_do_parse(method):
    """Call ``do_parse`` directly for each parse method and verify its output.

    The bytes of the ``cli_test_01.pdf`` asset are passed to ``do_parse``
    together with an empty model list, the parametrized ``method`` and
    ``f_dump_content_list=True``; the document name is ``"fake"``.  The test
    then checks that each expected artifact under ``fake/<method>`` exceeds a
    minimum size and that an ``images`` directory exists.

    Args:
        method: Parse method passed to ``do_parse``; one of ``"auto"``,
            ``"txt"`` or ``"ocr"``.
    """
    # setup
    unitest_dir = "/tmp/magic_pdf/unittest/tools"
    filename = "fake"
    os.makedirs(unitest_dir, exist_ok=True)
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # run
        with open("tests/test_tools/assets/common/cli_test_01.pdf", "rb") as f:
            bits = f.read()
        do_parse(
            temp_output_dir, filename, bits, [], method, f_dump_content_list=True
        )

        # check
        base_output_dir = os.path.join(temp_output_dir, f"fake/{method}")

        # (file name, size in bytes the file must exceed)
        min_sizes = [
            ("content_list.json", 5000),
            (f"{filename}.md", 7000),
            ("middle.json", 200000),
            ("model.json", 15000),
            ("origin.pdf", 500000),
            ("layout.pdf", 500000),
            ("spans.pdf", 500000),
        ]
        for name, min_size in min_sizes:
            r = os.stat(os.path.join(base_output_dir, name))
            assert r.st_size > min_size, name

        # These two calls were previously made without ``assert``, so their
        # results were discarded and the check could never fail.
        assert os.path.exists(os.path.join(base_output_dir, "images"))
        assert os.path.isdir(os.path.join(base_output_dir, "images"))
    finally:
        # teardown: always remove the temp dir, even when a check above fails
        shutil.rmtree(temp_output_dir)
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment