Unverified commit 85a4750d, authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #3026 from Sidney233/dev

Dev
parents 206ed770 a7e75dc0
[
{
"layout_dets": [
{
"category_id": 1,
"poly": [
578.2055053710938,
672.8831787109375,
1579.973388671875,
672.8831787109375,
1579.973388671875,
1034.681640625,
578.2055053710938,
1034.681640625
],
"score": 0.9999963045120239
},
{
"category_id": 1,
"poly": [
583.6041259765625,
1067.1112060546875,
1579.822265625,
1067.1112060546875,
1579.822265625,
1537.1324462890625,
583.6041259765625,
1537.1324462890625
],
"score": 0.9999961853027344
},
{
"category_id": 1,
"poly": [
585.4341430664062,
1568.220703125,
1578.5487060546875,
1568.220703125,
1578.5487060546875,
1931.516845703125,
585.4341430664062,
1931.516845703125
],
"score": 0.9999949336051941
},
{
"category_id": 1,
"poly": [
578.491455078125,
532.0020141601562,
1577.96337890625,
532.0020141601562,
1577.96337890625,
641.0128784179688,
578.491455078125,
641.0128784179688
],
"score": 0.999992847442627
},
{
"category_id": 1,
"poly": [
66.43791961669922,
1776.6951904296875,
530.4810180664062,
1776.6951904296875,
530.4810180664062,
1883.127685546875,
66.43791961669922,
1883.127685546875
],
"score": 0.9999925494194031
},
{
"category_id": 3,
"poly": [
70.23656463623047,
818.9393920898438,
517.8253784179688,
818.9393920898438,
517.8253784179688,
1076.5823974609375,
70.23656463623047,
1076.5823974609375
],
"score": 0.9999912977218628
},
{
"category_id": 1,
"poly": [
64.99957275390625,
651.9596557617188,
436.5134582519531,
651.9596557617188,
436.5134582519531,
723.5758056640625,
64.99957275390625,
723.5758056640625
],
"score": 0.9999804496765137
},
{
"category_id": 0,
"poly": [
556.2775268554688,
270.2123107910156,
1577.8211669921875,
270.2123107910156,
1577.8211669921875,
408.9685974121094,
556.2775268554688,
408.9685974121094
],
"score": 0.9999696016311646
},
{
"category_id": 1,
"poly": [
67.8562240600586,
1342.2239990234375,
530.5654296875,
1342.2239990234375,
530.5654296875,
1447.843017578125,
67.8562240600586,
1447.843017578125
],
"score": 0.9999648928642273
},
{
"category_id": 1,
"poly": [
65.74958801269531,
1631.3671875,
530.32861328125,
1631.3671875,
530.32861328125,
1772.413818359375,
65.74958801269531,
1772.413818359375
],
"score": 0.9999628067016602
},
{
"category_id": 1,
"poly": [
588.5570068359375,
2068.54931640625,
1525.3253173828125,
2068.54931640625,
1525.3253173828125,
2103.89013671875,
588.5570068359375,
2103.89013671875
],
"score": 0.9999607801437378
},
{
"category_id": 1,
"poly": [
586.5548706054688,
1963.105712890625,
1556.578125,
1963.105712890625,
1556.578125,
2034.8116455078125,
586.5548706054688,
2034.8116455078125
],
"score": 0.9999469518661499
},
{
"category_id": 5,
"poly": [
59.96487045288086,
1110.6282958984375,
529.9209594726562,
1110.6282958984375,
529.9209594726562,
1225.2921142578125,
59.96487045288086,
1225.2921142578125
],
"score": 0.999945878982544
},
{
"category_id": 2,
"poly": [
70.25292205810547,
103.42201232910156,
420.4892578125,
103.42201232910156,
420.4892578125,
223.39370727539062,
70.25292205810547,
223.39370727539062
],
"score": 0.9999405145645142
},
{
"category_id": 2,
"poly": [
1081.0203857421875,
2244.87890625,
1554.669189453125,
2244.87890625,
1554.669189453125,
2275.28662109375,
1081.0203857421875,
2275.28662109375
],
"score": 0.9999217987060547
},
{
"category_id": 1,
"poly": [
68.85404968261719,
345.9093017578125,
307.9080810546875,
345.9093017578125,
307.9080810546875,
409.0098876953125,
68.85404968261719,
409.0098876953125
],
"score": 0.9999183416366577
},
{
"category_id": 0,
"poly": [
65.58759307861328,
1295.9366455078125,
180.4149932861328,
1295.9366455078125,
180.4149932861328,
1328.867919921875,
65.58759307861328,
1328.867919921875
],
"score": 0.9998926520347595
},
{
"category_id": 2,
"poly": [
1245.0789794921875,
108.83513641357422,
1576.3131103515625,
108.83513641357422,
1576.3131103515625,
219.29042053222656,
1245.0789794921875,
219.29042053222656
],
"score": 0.9995975494384766
},
{
"category_id": 1,
"poly": [
65.75041961669922,
483.5210266113281,
428.6028137207031,
483.5210266113281,
428.6028137207031,
586.8894653320312,
65.75041961669922,
586.8894653320312
],
"score": 0.9993270635604858
},
{
"category_id": 0,
"poly": [
65.02926635742188,
445.02288818359375,
208.3317108154297,
445.02288818359375,
208.3317108154297,
476.65252685546875,
65.02926635742188,
476.65252685546875
],
"score": 0.9992279410362244
},
{
"category_id": 0,
"poly": [
556.96630859375,
453.08447265625,
673.0485229492188,
453.08447265625,
673.0485229492188,
490.60455322265625,
556.96630859375,
490.60455322265625
],
"score": 0.9949817657470703
},
{
"category_id": 1,
"poly": [
66.26518249511719,
1524.234130859375,
530.2540283203125,
1524.234130859375,
530.2540283203125,
1627.5291748046875,
66.26518249511719,
1627.5291748046875
],
"score": 0.9919581413269043
},
{
"category_id": 7,
"poly": [
62.5564079284668,
1227.41943359375,
380.10693359375,
1227.41943359375,
380.10693359375,
1252.8614501953125,
62.5564079284668,
1252.8614501953125
],
"score": 0.9918426275253296
},
{
"category_id": 1,
"poly": [
66.80464935302734,
1451.4775390625,
527.3795166015625,
1451.4775390625,
527.3795166015625,
1519.5836181640625,
66.80464935302734,
1519.5836181640625
],
"score": 0.9883899688720703
},
{
"category_id": 0,
"poly": [
65.36080932617188,
605.3754272460938,
181.24375915527344,
605.3754272460938,
181.24375915527344,
637.0076904296875,
65.36080932617188,
637.0076904296875
],
"score": 0.9870840311050415
},
{
"category_id": 0,
"poly": [
178.82904052734375,
264.6627197265625,
396.52825927734375,
264.6627197265625,
396.52825927734375,
315.41900634765625,
178.82904052734375,
315.41900634765625
],
"score": 0.9779323935508728
},
{
"category_id": 4,
"poly": [
66.15127563476562,
767.24658203125,
181.25694274902344,
767.24658203125,
181.25694274902344,
799.7832641601562,
66.15127563476562,
799.7832641601562
],
"score": 0.8932801485061646
},
{
"category_id": 13,
"poly": [
590,
747,
688,
747,
688,
778,
590,
778
],
"score": 0.91,
"latex": "+24.4\\%"
},
{
"category_id": 13,
"poly": [
1433,
855,
1492,
855,
1492,
886,
1433,
886
],
"score": 0.86,
"latex": "30\\%"
},
{
"category_id": 13,
"poly": [
238,
689,
264,
689,
264,
717,
238,
717
],
"score": 0.34,
"latex": "@"
},
{
"category_id": 13,
"poly": [
702,
1002,
722,
1002,
722,
1026,
702,
1026
],
"score": 0.33,
"latex": "^+"
},
{
"category_id": 13,
"poly": [
177,
1154,
223,
1154,
223,
1185,
177,
1185
],
"score": 0.28,
"latex": "(\\%)"
}
],
"page_info": {
"page_no": 0,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 2,
"poly": [
88.00849151611328,
31.891826629638672,
300.7432861328125,
31.891826629638672,
300.7432861328125,
113.5999755859375,
88.00849151611328,
113.5999755859375
],
"score": 0.9999986886978149
},
{
"category_id": 2,
"poly": [
771.0192260742188,
2213.479248046875,
827.4273681640625,
2213.479248046875,
827.4273681640625,
2239.40185546875,
771.0192260742188,
2239.40185546875
],
"score": 0.9999963641166687
},
{
"category_id": 7,
"poly": [
544.2962646484375,
488.5493469238281,
988.3958129882812,
488.5493469238281,
988.3958129882812,
541.0634155273438,
544.2962646484375,
541.0634155273438
],
"score": 0.9999918341636658
},
{
"category_id": 2,
"poly": [
1082.88232421875,
82.37471771240234,
1519.4150390625,
82.37471771240234,
1519.4150390625,
114.9271011352539,
1082.88232421875,
114.9271011352539
],
"score": 0.9999632835388184
},
{
"category_id": 2,
"poly": [
1009.1597900390625,
2210.9462890625,
1535.9239501953125,
2210.9462890625,
1535.9239501953125,
2241.830322265625,
1009.1597900390625,
2241.830322265625
],
"score": 0.9999324679374695
},
{
"category_id": 5,
"poly": [
537.349365234375,
156.8784637451172,
1584.9866943359375,
156.8784637451172,
1584.9866943359375,
485.3042907714844,
537.349365234375,
485.3042907714844
],
"score": 0.9985955953598022
},
{
"category_id": 7,
"poly": [
62.69784927368164,
443.4034118652344,
249.9097137451172,
443.4034118652344,
249.9097137451172,
467.4612731933594,
62.69784927368164,
467.4612731933594
],
"score": 0.9873980283737183
},
{
"category_id": 5,
"poly": [
61.374210357666016,
138.51153564453125,
528.30517578125,
138.51153564453125,
528.30517578125,
443.5376281738281,
61.374210357666016,
443.5376281738281
],
"score": 0.9232220649719238
},
{
"category_id": 6,
"poly": [
548.1119384765625,
148.7312774658203,
797.3070678710938,
148.7312774658203,
797.3070678710938,
180.74609375,
548.1119384765625,
180.74609375
],
"score": 0.6074804663658142
},
{
"category_id": 13,
"poly": [
864,
455,
922,
455,
922,
482,
864,
482
],
"score": 0.74,
"latex": "6.0\\%"
},
{
"category_id": 13,
"poly": [
850,
418,
922,
418,
922,
445,
850,
445
],
"score": 0.64,
"latex": "35.3\\%"
},
{
"category_id": 13,
"poly": [
1501,
270,
1571,
270,
1571,
298,
1501,
298
],
"score": 0.54,
"latex": "13.8\\%"
},
{
"category_id": 13,
"poly": [
1013,
454,
1083,
454,
1083,
482,
1013,
482
],
"score": 0.52,
"latex": "15.0\\%"
},
{
"category_id": 13,
"poly": [
1012,
417,
1083,
417,
1083,
444,
1012,
444
],
"score": 0.52,
"latex": "33.7\\%"
},
{
"category_id": 13,
"poly": [
689,
456,
725,
456,
725,
482,
689,
482
],
"score": 0.48,
"latex": "(\\%)"
},
{
"category_id": 13,
"poly": [
850,
344,
922,
344,
922,
372,
850,
372
],
"score": 0.4,
"latex": "83.8\\%"
},
{
"category_id": 13,
"poly": [
863,
270,
922,
270,
922,
298,
863,
298
],
"score": 0.4,
"latex": "4.5\\%"
},
{
"category_id": 13,
"poly": [
1334,
270,
1406,
270,
1406,
298,
1334,
298
],
"score": 0.35,
"latex": "37.2\\%"
},
{
"category_id": 13,
"poly": [
618,
419,
656,
419,
656,
446,
618,
446
],
"score": 0.35,
"latex": "(\\%)"
}
],
"page_info": {
"page_no": 1,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 2,
"poly": [
87.9037094116211,
31.59800148010254,
300.9930419921875,
31.59800148010254,
300.9930419921875,
113.4053955078125,
87.9037094116211,
113.4053955078125
],
"score": 0.9999939799308777
},
{
"category_id": 2,
"poly": [
1008.992919921875,
2209.248779296875,
1534.9334716796875,
2209.248779296875,
1534.9334716796875,
2242.77294921875,
1008.992919921875,
2242.77294921875
],
"score": 0.9999377131462097
},
{
"category_id": 2,
"poly": [
770.6600341796875,
2212.857666015625,
827.4126586914062,
2212.857666015625,
827.4126586914062,
2239.77197265625,
770.6600341796875,
2239.77197265625
],
"score": 0.9998395442962646
},
{
"category_id": 2,
"poly": [
1082.096923828125,
82.25012969970703,
1518.9267578125,
82.25012969970703,
1518.9267578125,
114.52576446533203,
1082.096923828125,
114.52576446533203
],
"score": 0.9996457099914551
},
{
"category_id": 7,
"poly": [
95.39900970458984,
1846.6380615234375,
564.4166870117188,
1846.6380615234375,
564.4166870117188,
1899.209716796875,
95.39900970458984,
1899.209716796875
],
"score": 0.9908766746520996
},
{
"category_id": 6,
"poly": [
95.4662094116211,
173.42832946777344,
470.21905517578125,
173.42832946777344,
470.21905517578125,
217.74632263183594,
95.4662094116211,
217.74632263183594
],
"score": 0.9437939524650574
},
{
"category_id": 5,
"poly": [
854.1142578125,
1043.93603515625,
1592.0174560546875,
1043.93603515625,
1592.0174560546875,
1846.16552734375,
854.1142578125,
1846.16552734375
],
"score": 0.8844046592712402
},
{
"category_id": 5,
"poly": [
92.02946472167969,
1331.8909912109375,
814.2915649414062,
1331.8909912109375,
814.2915649414062,
1842.6195068359375,
92.02946472167969,
1842.6195068359375
],
"score": 0.8743430972099304
},
{
"category_id": 5,
"poly": [
851.83984375,
224.99559020996094,
1592.4068603515625,
224.99559020996094,
1592.4068603515625,
1018.7105712890625,
851.83984375,
1018.7105712890625
],
"score": 0.8650150299072266
},
{
"category_id": 5,
"poly": [
91.79800415039062,
224.10838317871094,
816.58154296875,
224.10838317871094,
816.58154296875,
1248.422607421875,
91.79800415039062,
1248.422607421875
],
"score": 0.8604844808578491
},
{
"category_id": 5,
"poly": [
85.19661712646484,
220.71524047851562,
1602.3074951171875,
220.71524047851562,
1602.3074951171875,
1844.488525390625,
85.19661712646484,
1844.488525390625
],
"score": 0.6638449430465698
},
{
"category_id": 13,
"poly": [
737,
704,
804,
704,
804,
730,
737,
730
],
"score": 0.56,
"latex": "\\pmb{26.5\\%}"
},
{
"category_id": 13,
"poly": [
738,
673,
804,
673,
804,
699,
738,
699
],
"score": 0.48,
"latex": "\\pmb{16.2\\%}"
},
{
"category_id": 13,
"poly": [
736,
767,
805,
767,
805,
795,
736,
795
],
"score": 0.48,
"latex": "\\mathbf{\\lambda_{23.7\\%}}"
},
{
"category_id": 13,
"poly": [
231,
611,
268,
611,
268,
638,
231,
638
],
"score": 0.47,
"latex": "(\\%)"
},
{
"category_id": 13,
"poly": [
749,
736,
804,
736,
804,
763,
749,
763
],
"score": 0.41,
"latex": "\\pmb{9.2\\%}"
},
{
"category_id": 13,
"poly": [
737,
641,
804,
641,
804,
668,
737,
668
],
"score": 0.41,
"latex": "{\\bf38.0\\%}"
},
{
"category_id": 13,
"poly": [
748,
577,
805,
577,
805,
606,
748,
606
],
"score": 0.35,
"latex": "0.1\\%"
},
{
"category_id": 13,
"poly": [
187,
800,
222,
800,
222,
827,
187,
827
],
"score": 0.32,
"latex": "(\\%)"
},
{
"category_id": 13,
"poly": [
738,
830,
805,
830,
805,
857,
738,
857
],
"score": 0.28,
"latex": "\\mathbf{13.8\\%}"
},
{
"category_id": 13,
"poly": [
737,
862,
805,
862,
805,
889,
737,
889
],
"score": 0.27,
"latex": "\\mathbf{31.9\\%}"
},
{
"category_id": 13,
"poly": [
736,
955,
804,
955,
804,
983,
736,
983
],
"score": 0.26,
"latex": "\\pmb{65.3\\%}"
}
],
"page_info": {
"page_no": 2,
"height": 2339,
"width": 1654
}
},
{
"layout_dets": [
{
"category_id": 2,
"poly": [
86.3010025024414,
32.05937194824219,
303.65325927734375,
32.05937194824219,
303.65325927734375,
114.77494049072266,
86.3010025024414,
114.77494049072266
],
"score": 0.9999954700469971
},
{
"category_id": 1,
"poly": [
108.4952392578125,
590.2026977539062,
1536.75146484375,
590.2026977539062,
1536.75146484375,
688.4915771484375,
108.4952392578125,
688.4915771484375
],
"score": 0.9999932646751404
},
{
"category_id": 0,
"poly": [
95.94864654541016,
1205.4134521484375,
252.92477416992188,
1205.4134521484375,
252.92477416992188,
1246.0015869140625,
95.94864654541016,
1246.0015869140625
],
"score": 0.999992847442627
},
{
"category_id": 1,
"poly": [
106.48407745361328,
338.27471923828125,
1568.86328125,
338.27471923828125,
1568.86328125,
437.84783935546875,
106.48407745361328,
437.84783935546875
],
"score": 0.9999897480010986
},
{
"category_id": 2,
"poly": [
767.6918334960938,
2212.269287109375,
830.787353515625,
2212.269287109375,
830.787353515625,
2239.28515625,
767.6918334960938,
2239.28515625
],
"score": 0.9999850988388062
},
{
"category_id": 0,
"poly": [
96.18482208251953,
508.36334228515625,
291.4427490234375,
508.36334228515625,
291.4427490234375,
549.4661865234375,
96.18482208251953,
549.4661865234375
],
"score": 0.9999837875366211
},
{
"category_id": 2,
"poly": [
1082.2672119140625,
81.18732452392578,
1520.2149658203125,
81.18732452392578,
1520.2149658203125,
116.55751037597656,
1082.2672119140625,
116.55751037597656
],
"score": 0.9999496340751648
},
{
"category_id": 0,
"poly": [
96.45167541503906,
157.92835998535156,
319.21392822265625,
157.92835998535156,
319.21392822265625,
213.8436279296875,
96.45167541503906,
213.8436279296875
],
"score": 0.9999274015426636
},
{
"category_id": 0,
"poly": [
96.99238586425781,
257.6522216796875,
483.6472473144531,
257.6522216796875,
483.6472473144531,
301.53717041015625,
96.99238586425781,
301.53717041015625
],
"score": 0.9999104738235474
},
{
"category_id": 2,
"poly": [
1008.8760986328125,
2208.609375,
1536.0474853515625,
2208.609375,
1536.0474853515625,
2243.414306640625,
1008.8760986328125,
2243.414306640625
],
"score": 0.9998928308486938
},
{
"category_id": 1,
"poly": [
108.46533203125,
1288.0927734375,
1546.7518310546875,
1288.0927734375,
1546.7518310546875,
1383.8438720703125,
108.46533203125,
1383.8438720703125
],
"score": 0.9997898936271667
},
{
"category_id": 1,
"poly": [
107.81462860107422,
1678.24609375,
1227.880615234375,
1678.24609375,
1227.880615234375,
1711.37255859375,
107.81462860107422,
1711.37255859375
],
"score": 0.99957275390625
},
{
"category_id": 5,
"poly": [
109.75360107421875,
810.0169677734375,
1579.9549560546875,
810.0169677734375,
1579.9549560546875,
1171.6383056640625,
109.75360107421875,
1171.6383056640625
],
"score": 0.9994542598724365
},
{
"category_id": 1,
"poly": [
106.46218872070312,
1548.299072265625,
1540.3388671875,
1548.299072265625,
1540.3388671875,
1676.67919921875,
106.46218872070312,
1676.67919921875
],
"score": 0.9886452555656433
},
{
"category_id": 1,
"poly": [
107.52558898925781,
1386.4000244140625,
1540.886962890625,
1386.4000244140625,
1540.886962890625,
1447.8128662109375,
107.52558898925781,
1447.8128662109375
],
"score": 0.9709398150444031
},
{
"category_id": 1,
"poly": [
107.66414642333984,
1451.8369140625,
1537.99169921875,
1451.8369140625,
1537.99169921875,
1546.690185546875,
107.66414642333984,
1546.690185546875
],
"score": 0.9590120315551758
},
{
"category_id": 6,
"poly": [
95.90371704101562,
728.2855224609375,
328.1967468261719,
728.2855224609375,
328.1967468261719,
768.121826171875,
95.90371704101562,
768.121826171875
],
"score": 0.6999977827072144
},
{
"category_id": 1,
"poly": [
106.67481994628906,
1371.857421875,
1544.84814453125,
1371.857421875,
1544.84814453125,
1678.67236328125,
106.67481994628906,
1678.67236328125
],
"score": 0.5645973086357117
},
{
"category_id": 0,
"poly": [
95.94171142578125,
728.264404296875,
328.1947937011719,
728.264404296875,
328.1947937011719,
768.1663818359375,
95.94171142578125,
768.1663818359375
],
"score": 0.30702608823776245
},
{
"category_id": 13,
"poly": [
1247,
887,
1353,
887,
1353,
914,
1247,
914
],
"score": 0.91,
"latex": "5\\%{\\sim}20\\%"
},
{
"category_id": 13,
"poly": [
1181,
923,
1290,
923,
1290,
950,
1181,
950
],
"score": 0.9,
"latex": "-5\\%{+}5\\%"
},
{
"category_id": 13,
"poly": [
1416,
1047,
1469,
1047,
1469,
1077,
1416,
1077
],
"score": 0.87,
"latex": "10\\%"
},
{
"category_id": 13,
"poly": [
1254,
963,
1296,
963,
1296,
991,
1254,
991
],
"score": 0.86,
"latex": "5\\%"
},
{
"category_id": 13,
"poly": [
1373,
1003,
1428,
1003,
1428,
1032,
1373,
1032
],
"score": 0.86,
"latex": "10\\%"
},
{
"category_id": 13,
"poly": [
1332,
1047,
1388,
1047,
1388,
1076,
1332,
1076
],
"score": 0.86,
"latex": "\\cdot10\\%"
},
{
"category_id": 13,
"poly": [
1373,
1112,
1428,
1112,
1428,
1141,
1373,
1141
],
"score": 0.85,
"latex": "10\\%"
},
{
"category_id": 13,
"poly": [
1248,
854,
1302,
854,
1302,
880,
1248,
880
],
"score": 0.85,
"latex": "z0\\%"
}
],
"page_info": {
"page_no": 3,
"height": 2339,
"width": 1654
}
}
]
\ No newline at end of file
"""
bench
"""
import os
import shutil
import json
from lib import calculate_score
import pytest
from conf import conf
# Workspace root taken from the GITHUB_WORKSPACE environment variable
# (``os.environ.get`` yields None when the variable is unset).
code_path = os.environ.get('GITHUB_WORKSPACE')
# Directories looked up in the ``conf.conf`` mapping.
pdf_dev_path = conf.conf["pdf_dev_path"]
pdf_res_path = conf.conf["pdf_res_path"]
class TestBench():
    """Benchmark regression check that compares new scores with a baseline."""

    def test_ci_ben(self):
        """Fail when any benchmark metric falls below the recorded baseline.

        The baseline is the last line of ``<pdf_dev_path>/result.json``. After
        running the ``pre_clean.py`` script, fresh scores are obtained from
        ``get_score()``, written to ``<pdf_dev_path>/ci/result.json`` and then
        compared metric by metric with the baseline.

        Raises:
            AssertionError: if a new metric is lower than its baseline value.
        """
        baseline_file = os.path.join(pdf_dev_path, "result.json")
        # The context manager closes the handle even when reading fails.
        with open(baseline_file, "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        last_line = lines[-1].strip()
        last_score = json.loads(last_line)
        last_simscore = last_score["average_sim_score"]
        last_editdistance = last_score["average_edit_distance"]
        last_bleu = last_score["average_bleu_score"]

        os.system(f"python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}")
        now_score = get_score()
        print("now_score:", now_score)

        ci_dir = os.path.join(pdf_dev_path, "ci")
        # exist_ok=True already covers the "directory is present" case.
        os.makedirs(ci_dir, exist_ok=True)
        # Close (and therefore flush) the result file before asserting, so the
        # scores reach disk even when one of the assertions below fails.
        with open(os.path.join(ci_dir, "result.json"), "w+", encoding="utf-8") as fw:
            fw.write(json.dumps(now_score) + "\n")

        now_simscore = now_score["average_sim_score"]
        now_editdistance = now_score["average_edit_distance"]
        now_bleu = now_score["average_bleu_score"]
        assert last_simscore <= now_simscore
        # NOTE(review): this requires the edit distance not to decrease;
        # confirm that a larger value really is the better one for this metric.
        assert last_editdistance <= now_editdistance
        assert last_bleu <= now_bleu
def get_score():
    """Build a ``Scoring`` object for ``result.json`` and return its summary.

    The scorer is pointed at ``<pdf_dev_path>/result.json``, asked to compute
    the similarity totals for the ``mineru`` tool, and its
    ``summary_scores()`` result is handed back unchanged.
    """
    result_file = os.path.join(pdf_dev_path, "result.json")
    scorer = calculate_score.Scoring(result_file)
    scorer.calculate_similarity_total("mineru", pdf_dev_path)
    return scorer.summary_scores()
import json
import os
import shutil
from conf import conf
from lib import calculate_score
# Directories looked up in the ``conf.conf`` mapping.
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
class TestCliCuda:
    """CLI benchmark run: convert the PDFs, score them, compare with a baseline."""

    def test_pdf_sdk_cuda(self):
        """Regenerate the markdown output and check the scores against the baseline.

        Steps: remove ``pdf_res_path``, convert every PDF via
        ``pdf_to_markdown()``, read the last line of
        ``<pdf_dev_path>/result.json`` as the baseline, run ``pre_clean.py``,
        compute new scores with ``get_score()``, store them in
        ``<pdf_dev_path>/ci/result.json`` and compare the three metrics.

        Raises:
            AssertionError: if a new metric is lower than its baseline value.
        """
        clean_magicpdf(pdf_res_path)
        pdf_to_markdown()

        # The context manager closes the handle even when reading fails.
        with open(os.path.join(pdf_dev_path, 'result.json'), 'r', encoding='utf-8') as fr:
            lines = fr.readlines()
        last_line = lines[-1].strip()
        last_score = json.loads(last_line)
        last_simscore = last_score['average_sim_score']
        last_editdistance = last_score['average_edit_distance']
        last_bleu = last_score['average_bleu_score']

        os.system(f'python tests/test_cli/lib/pre_clean.py --tool_name mineru --download_dir {pdf_dev_path}')
        now_score = get_score()
        print('now_score:', now_score)

        ci_dir = os.path.join(pdf_dev_path, 'ci')
        # exist_ok=True already covers the "directory is present" case.
        os.makedirs(ci_dir, exist_ok=True)
        # Close (and therefore flush) the result file before asserting, so the
        # scores reach disk even when one of the assertions below fails.
        with open(os.path.join(ci_dir, 'result.json'), 'w+', encoding='utf-8') as fw:
            fw.write(json.dumps(now_score) + '\n')

        now_simscore = now_score['average_sim_score']
        now_editdistance = now_score['average_edit_distance']
        now_bleu = now_score['average_bleu_score']
        assert last_simscore <= now_simscore
        # NOTE(review): this requires the edit distance not to decrease;
        # confirm that a larger value really is the better one for this metric.
        assert last_editdistance <= now_editdistance
        assert last_bleu <= now_bleu
def pdf_to_markdown():
    """Convert every PDF under ``<pdf_dev_path>/pdf`` and collect the markdown.

    For each ``*.pdf`` file the ``magic-pdf pdf-command`` CLI is invoked, and
    the markdown found at ``<pdf_res_path>/<name>/auto/<name>.md`` is copied to
    ``<pdf_dev_path>/mineru/<name>.md``.

    Raises:
        OSError: from ``shutil.copy`` when the expected markdown file is missing.
    """
    import shlex  # local import keeps this function self-contained

    pdf_dir = os.path.join(pdf_dev_path, 'pdf')
    # splitext keeps dots inside the stem ("a.b.pdf" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_dir)
        if pdf_file.endswith('.pdf')
    ]
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dir, f'{demo_name}.pdf')
        # Quote the path so spaces or shell metacharacters cannot split the command.
        cmd = 'magic-pdf pdf-command --pdf %s --inside_model true' % shlex.quote(pdf_path)
        os.system(cmd)
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        # exist_ok=True makes a separate existence check unnecessary.
        os.makedirs(dir_path, exist_ok=True)
        res_path = os.path.join(dir_path, f'{demo_name}.md')
        src_path = os.path.join(pdf_res_path, demo_name, 'auto', f'{demo_name}.md')
        shutil.copy(src_path, res_path)
def get_score():
    """Score the ``mineru`` output and return the scorer's summary.

    A ``Scoring`` object is created for ``<pdf_dev_path>/result.json``, its
    similarity totals are computed for the ``mineru`` tool, and the value of
    ``summary_scores()`` is returned as is.
    """
    scorer = calculate_score.Scoring(os.path.join(pdf_dev_path, 'result.json'))
    scorer.calculate_similarity_total('mineru', pdf_dev_path)
    return scorer.summary_scores()
def clean_magicpdf(pdf_res_path):
    """Delete a previous output location, ignoring paths that do not exist.

    Args:
        pdf_res_path: path of the directory (or single file) to remove.

    The removal is best-effort: failures are ignored rather than raised.
    """
    # Use the standard library instead of shelling out to ``rm -rf``: the path
    # is taken literally, so spaces or shell metacharacters cannot redirect the
    # deletion, and no external ``rm`` binary is required.
    if os.path.isdir(pdf_res_path) and not os.path.islink(pdf_res_path):
        shutil.rmtree(pdf_res_path, ignore_errors=True)
    elif os.path.lexists(pdf_res_path):
        try:
            os.remove(pdf_res_path)
        except OSError:
            # Keep the best-effort behaviour of the original command.
            pass
"""test cli and sdk."""
import logging
import os
import pytest
from conf import conf
from lib import common
import time
import magic_pdf.model as model_config
from magic_pdf.data.read_api import read_local_images
from magic_pdf.data.read_api import read_local_office
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# Directories looked up in the ``conf.conf`` mapping.
pdf_res_path = conf.conf['pdf_res_path']
code_path = conf.conf['code_path']
pdf_dev_path = conf.conf['pdf_dev_path']
# Configuration file passed to ``common.update_config_file`` by the tests.
# NOTE(review): hard-coded absolute path inside one user's home directory;
# presumably specific to the CI machine - confirm before running elsewhere.
magic_pdf_config = "/home/quyuan/magic-pdf.json"
class TestCli:
"""test cli."""
@pytest.fixture(autouse=True)
def setup(self):
    """Autouse fixture executed around every test of the class.

    Calls ``common.clear_gpu_memory()`` and sets ``device-mode`` to ``cuda`` in
    the magic-pdf configuration file before yielding to the test.
    """
    common.clear_gpu_memory()
    common.update_config_file(magic_pdf_config, "device-mode", "cuda")
    # Any additional pre-test preparation can be added here.
    yield
@pytest.mark.P0
def test_pdf_local_sdk(self):
    """Run the SDK pipeline on every local PDF and check the produced files.

    Each ``*.pdf`` under ``<pdf_dev_path>/pdf`` is classified and analysed in
    OCR or text mode accordingly. The model/layout/span PDFs, the markdown,
    the content list and the middle JSON are then written to
    ``<pdf_dev_path>/mineru`` and the output folder is checked.
    """
    pdf_dir = os.path.join(pdf_dev_path, 'pdf')
    # splitext keeps dots inside the stem ("a.b.pdf" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_dir)
        if pdf_file.endswith('.pdf')
    ]
    # NOTE(review): the source had lost its indentation; every statement below
    # is assumed to belong to the per-document loop - confirm.
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
        local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        name_without_suff = demo_name
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
        reader1 = FileBasedDataReader("")
        pdf_bytes = reader1.read(pdf_path)
        ds = PymuDocDataset(pdf_bytes)

        # Inference followed by the pipeline matching the detected parse method.
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        common.delete_file(dir_path)

        # Draw the model result on each page.
        infer_result.draw_model(os.path.join(dir_path, f"{name_without_suff}_model.pdf"))
        # The getters below are called for their own sake; their return values
        # were never used by this test, so they are no longer bound to names.
        infer_result.get_infer_res()
        # Draw the layout result on each page.
        pipe_result.draw_layout(os.path.join(dir_path, f"{name_without_suff}_layout.pdf"))
        # Draw the spans result on each page.
        pipe_result.draw_span(os.path.join(dir_path, f"{name_without_suff}_spans.pdf"))
        # Markdown: fetch it, then dump it.
        pipe_result.get_markdown(image_dir)
        pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
        # Content list: fetch it, then dump it.
        pipe_result.get_content_list(image_dir)
        pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
        # Middle JSON: fetch it, then dump it.
        pipe_result.get_middle_json()
        pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')

        common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_s3_sdk(self):
    """Placeholder for the S3 SDK test.

    The body only lists the local PDF folder and derives per-document paths;
    it performs no S3 access and makes no assertions.
    """
    source_dir = os.path.join(pdf_dev_path, 'pdf')
    demo_names = [
        entry.split('.')[0]
        for entry in os.listdir(source_dir)
        if entry.endswith('.pdf')
    ]
    for demo_name in demo_names:
        # Values are derived but not used yet.
        pdf_path = os.path.join(pdf_dev_path, 'pdf', f'{demo_name}.pdf')
        local_image_dir = os.path.join(pdf_dev_path, 'pdf', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        name_without_suff = os.path.basename(pdf_path).split(".pdf")[0]
        dir_path = os.path.join(pdf_dev_path, 'mineru')
    pass
@pytest.mark.P0
def test_pdf_local_ppt(self):
    """Convert every local ``.pptx`` file to markdown through the SDK.

    Each file under ``<pdf_dev_path>/ppt`` is read with ``read_local_office``,
    analysed with ``ocr=True``, piped in text mode and dumped as markdown into
    ``<pdf_dev_path>/mineru``; the output folder is then checked.
    """
    ppt_dir = os.path.join(pdf_dev_path, 'ppt')
    # splitext keeps dots inside the stem ("a.b.pptx" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(ppt_file)[0]
        for ppt_file in os.listdir(ppt_dir)
        if ppt_file.endswith('.pptx')
    ]
    # NOTE(review): the source had lost its indentation; every statement below
    # is assumed to belong to the per-document loop - confirm.
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dev_path, 'ppt', f'{demo_name}.pptx')
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        name_without_suff = demo_name
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
        ds = read_local_office(pdf_path)[0]
        common.delete_file(dir_path)
        ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
        common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_local_image(self):
    """Convert every local ``.jpg`` image to markdown through the SDK.

    Each file under ``<pdf_dev_path>/images`` is read with
    ``read_local_images``, analysed with ``ocr=True``, piped in OCR mode and
    dumped as markdown into ``<pdf_dev_path>/mineru``; the output folder is
    then checked.
    """
    images_dir = os.path.join(pdf_dev_path, 'images')
    # splitext keeps dots inside the stem ("a.b.jpg" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(image_file)[0]
        for image_file in os.listdir(images_dir)
        if image_file.endswith('.jpg')
    ]
    # NOTE(review): the source had lost its indentation; every statement below
    # is assumed to belong to the per-image loop - confirm.
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dev_path, 'images', f'{demo_name}.jpg')
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        name_without_suff = demo_name
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        common.delete_file(dir_path)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
        ds = read_local_images(pdf_path)[0]
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{name_without_suff}.md", image_dir)
        common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_local_image_dir(self):
    """Convert a whole directory of images to markdown through the SDK.

    All ``.png`` and ``.jpg`` files under ``<pdf_dev_path>/images`` are loaded
    with a single ``read_local_images`` call; each resulting dataset is dumped
    as ``<index>.md`` into ``<pdf_dev_path>/mineru``.
    """
    pdf_path = os.path.join(pdf_dev_path, 'images')
    dir_path = os.path.join(pdf_dev_path, 'mineru')
    local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
    image_dir = str(os.path.basename(local_image_dir))
    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
    common.delete_file(dir_path)
    dss = read_local_images(pdf_path, suffixes=['.png', '.jpg'])
    # enumerate replaces the hand-maintained counter; numbering still starts at 0.
    for count, ds in enumerate(dss):
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{count}.md", image_dir)
    # NOTE(review): the source had lost its indentation; the folder check is
    # assumed to run once, after all images were processed - confirm.
    common.sdk_count_folders_and_check_contents(dir_path)
def test_local_doc_parse(self):
    """Parse every local ``.docx`` document to markdown through the SDK.

    Each file under ``<pdf_dev_path>/doc`` is read with ``read_local_office``,
    analysed with ``ocr=True``, piped in text mode and dumped as markdown into
    ``<pdf_dev_path>/mineru``; the output folder is then checked.
    """
    doc_dir = os.path.join(pdf_dev_path, 'doc')
    # splitext keeps dots inside the stem ("a.b.docx" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(doc_file)[0]
        for doc_file in os.listdir(doc_dir)
        if doc_file.endswith('.docx')
    ]
    # NOTE(review): the source had lost its indentation; every statement below
    # is assumed to belong to the per-document loop - confirm.
    for demo_name in demo_names:
        pdf_path = os.path.join(pdf_dev_path, 'doc', f'{demo_name}.docx')
        local_image_dir = os.path.join(pdf_dev_path, 'mineru', 'images')
        image_dir = str(os.path.basename(local_image_dir))
        name_without_suff = demo_name
        dir_path = os.path.join(pdf_dev_path, 'mineru')
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(dir_path)
        ds = read_local_office(pdf_path)[0]
        common.delete_file(dir_path)
        ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
        common.sdk_count_folders_and_check_contents(dir_path)
@pytest.mark.P0
def test_pdf_cli_auto(self):
    """Run the ``magic-pdf`` CLI in ``auto`` mode on every local PDF.

    For each ``*.pdf`` under ``<pdf_dev_path>/pdf`` the output folder
    ``<pdf_dev_path>/mineru`` is cleared, the CLI is invoked, and the
    ``<name>/auto`` result folder is checked.
    """
    import shlex  # local import keeps this method self-contained

    time.sleep(2)
    pdf_path = os.path.join(pdf_dev_path, 'pdf')
    # splitext keeps dots inside the stem ("a.b.pdf" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_path)
        if pdf_file.endswith('.pdf')
    ]
    for demo_name in demo_names:
        res_path = os.path.join(pdf_dev_path, 'mineru')
        common.delete_file(res_path)
        # Quote both paths so spaces or shell metacharacters cannot split the
        # command; paths without such characters are left unchanged.
        cmd = 'magic-pdf -p %s -o %s -m %s' % (
            shlex.quote(os.path.join(pdf_path, f'{demo_name}.pdf')),
            shlex.quote(res_path), 'auto')
        logging.info(cmd)
        os.system(cmd)
        common.cli_count_folders_and_check_contents(
            os.path.join(res_path, demo_name, 'auto'))
@pytest.mark.P0
def test_pdf_cli_txt(self):
    """Run the ``magic-pdf`` CLI in ``txt`` mode on every local PDF.

    For each ``*.pdf`` under ``<pdf_dev_path>/pdf`` the output folder
    ``<pdf_dev_path>/mineru`` is cleared, the CLI is invoked, and the
    ``<name>/txt`` result folder is checked.
    """
    import shlex  # local import keeps this method self-contained

    time.sleep(2)
    pdf_path = os.path.join(pdf_dev_path, 'pdf')
    # splitext keeps dots inside the stem ("a.b.pdf" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_path)
        if pdf_file.endswith('.pdf')
    ]
    for demo_name in demo_names:
        res_path = os.path.join(pdf_dev_path, 'mineru')
        common.delete_file(res_path)
        # Quote both paths so spaces or shell metacharacters cannot split the
        # command; paths without such characters are left unchanged.
        cmd = 'magic-pdf -p %s -o %s -m %s' % (
            shlex.quote(os.path.join(pdf_path, f'{demo_name}.pdf')),
            shlex.quote(res_path), 'txt')
        logging.info(cmd)
        os.system(cmd)
        common.cli_count_folders_and_check_contents(
            os.path.join(res_path, demo_name, 'txt'))
@pytest.mark.P0
def test_pdf_cli_ocr(self):
    """Run the ``magic-pdf`` CLI in ``ocr`` mode on every local PDF.

    For each ``*.pdf`` under ``<pdf_dev_path>/pdf`` the output folder
    ``<pdf_dev_path>/mineru`` is cleared, the CLI is invoked, and the
    ``<name>/ocr`` result folder is checked.
    """
    import shlex  # local import keeps this method self-contained

    time.sleep(2)
    pdf_path = os.path.join(pdf_dev_path, 'pdf')
    # splitext keeps dots inside the stem ("a.b.pdf" -> "a.b"); split('.')[0]
    # would truncate the name and point at a file that does not exist.
    demo_names = [
        os.path.splitext(pdf_file)[0]
        for pdf_file in os.listdir(pdf_path)
        if pdf_file.endswith('.pdf')
    ]
    for demo_name in demo_names:
        res_path = os.path.join(pdf_dev_path, 'mineru')
        common.delete_file(res_path)
        # Quote both paths so spaces or shell metacharacters cannot split the
        # command; paths without such characters are left unchanged.
        cmd = 'magic-pdf -p %s -o %s -m %s' % (
            shlex.quote(os.path.join(pdf_path, f'{demo_name}.pdf')),
            shlex.quote(res_path), 'ocr')
        logging.info(cmd)
        os.system(cmd)
        common.cli_count_folders_and_check_contents(
            os.path.join(res_path, demo_name, 'ocr'))
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_txt(self):
    """Invoke ``magic-pdf-dev`` on the local ``line1.jsonl`` with method ``txt``.

    The command is logged and executed; its exit status is not checked.
    """
    time.sleep(2)
    method = "txt"
    jsonl_file = os.path.join(pdf_dev_path, 'line1.jsonl')
    command = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_file, method)
    logging.info(command)
    os.system(command)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_ocr(self):
    """Invoke ``magic-pdf-dev`` on the local ``line1.jsonl`` with method ``ocr``.

    The command is logged and executed; its exit status is not checked.
    """
    time.sleep(2)
    method = 'ocr'
    jsonl_file = os.path.join(pdf_dev_path, 'line1.jsonl')
    command = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_file, method)
    logging.info(command)
    os.system(command)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_local_jsonl_auto(self):
    """Invoke ``magic-pdf-dev`` on the local ``line1.jsonl`` with method ``auto``.

    The command is logged and executed; its exit status is not checked.
    """
    time.sleep(2)
    method = 'auto'
    jsonl_file = os.path.join(pdf_dev_path, 'line1.jsonl')
    command = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_file, method)
    logging.info(command)
    os.system(command)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_txt(self):
    """Invoke ``magic-pdf-dev`` on ``line1.jsonl`` with method ``txt`` (s3 case).

    Currently skipped. The exit status of the command is not checked.

    NOTE(review): the command built here is the same as in the "local"
    variant; whether the jsonl records point at S3 cannot be told from this
    method - confirm against the fixture file.
    """
    time.sleep(2)
    method = "txt"
    jsonl_file = os.path.join(pdf_dev_path, 'line1.jsonl')
    command = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_file, method)
    logging.info(command)
    os.system(command)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_ocr(self):
    """Invoke ``magic-pdf-dev`` on ``line1.jsonl`` with method ``ocr`` (s3 case).

    Currently skipped. The exit status of the command is not checked.

    NOTE(review): the command built here is the same as in the "local"
    variant; whether the jsonl records point at S3 cannot be told from this
    method - confirm against the fixture file.
    """
    time.sleep(2)
    method = 'ocr'
    jsonl_file = os.path.join(pdf_dev_path, 'line1.jsonl')
    command = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_file, method)
    logging.info(command)
    os.system(command)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_s3_jsonl_auto(self):
    """Invoke ``magic-pdf-dev`` on ``line1.jsonl`` with method ``auto`` (s3 case).

    Currently skipped. The exit status of the command is not checked.

    NOTE(review): the command built here is the same as in the "local"
    variant; whether the jsonl records point at S3 cannot be told from this
    method - confirm against the fixture file.
    """
    time.sleep(2)
    method = 'auto'
    jsonl_file = os.path.join(pdf_dev_path, 'line1.jsonl')
    command = 'magic-pdf-dev --jsonl %s --method %s' % (jsonl_file, method)
    logging.info(command)
    os.system(command)
@pytest.mark.P1
def test_pdf_dev_cli_pdf_json_auto(self):
    """Invoke ``magic-pdf-dev`` with a PDF plus a model JSON, method ``auto``.

    Uses ``pdf/test_rearch_report.pdf`` and ``test_model.json`` from
    ``pdf_dev_path``. The command is logged and run through ``os.system``;
    its exit status is not checked.
    """
    time.sleep(2)
    method = 'auto'
    model_json = os.path.join(pdf_dev_path, 'test_model.json')
    source_pdf = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
    command = 'magic-pdf-dev --pdf %s --json %s --method %s' % (
        source_pdf, model_json, method)
    logging.info(command)
    os.system(command)
@pytest.mark.skip(reason='out-of-date api')
@pytest.mark.P1
def test_pdf_dev_cli_pdf_json_ocr(self):
    """Invoke ``magic-pdf-dev`` with a PDF plus a model JSON, method ``ocr``.

    Uses ``pdf/test_rearch_report.pdf`` and ``test_model.json`` from
    ``pdf_dev_path``. Currently skipped. The exit status of the command is
    not checked.
    """
    time.sleep(2)
    json_path = os.path.join(pdf_dev_path, 'test_model.json')
    pdf_path = os.path.join(pdf_dev_path, 'pdf', 'test_rearch_report.pdf')
    # This is the ocr variant: it previously passed 'auto', which made it an
    # exact duplicate of test_pdf_dev_cli_pdf_json_auto.
    cmd = 'magic-pdf-dev --pdf %s --json %s --method %s' % (
        pdf_path, json_path, 'ocr')
    logging.info(cmd)
    os.system(cmd)
@pytest.mark.P1
def test_local_magic_pdf_open_rapidai_table(self):
    """Run ``magic-pdf`` with the ``rapid_table`` table model switched on.

    The table settings are passed to ``common.update_config_file`` under the
    ``table-config`` key, the CLI converts ``test_rearch_report.pdf``, and
    the test asserts that ``common.check_html_table_exists`` returns ``True``
    for the generated markdown file.
    """
    time.sleep(2)
    table_settings = {
        "model": "rapid_table",
        "enable": True,
        "sub_model": "slanet_plus",
        "max_time": 400
    }
    common.update_config_file(magic_pdf_config, "table-config", table_settings)
    source_pdf = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    common.delete_file(pdf_res_path)
    os.system("magic-pdf -p %s -o %s" % (source_pdf, pdf_res_path))
    markdown_file = os.path.join(
        pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")
    assert common.check_html_table_exists(markdown_file) is True
@pytest.mark.P1
def test_local_magic_pdf_doclayout_yolo(self):
    """Run ``magic-pdf`` with the ``doclayout_yolo`` layout model selected.

    The layout setting is passed to ``common.update_config_file`` under the
    ``layout-config`` key; the ``auto`` output directory is then handed to
    ``common.cli_count_folders_and_check_contents``.
    """
    time.sleep(2)
    layout_settings = {"model": "doclayout_yolo"}
    common.update_config_file(magic_pdf_config, "layout-config", layout_settings)
    source_pdf = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    common.delete_file(pdf_res_path)
    os.system("magic-pdf -p %s -o %s" % (source_pdf, pdf_res_path))
    output_dir = os.path.join(pdf_res_path, "test_rearch_report", "auto")
    common.cli_count_folders_and_check_contents(output_dir)
@pytest.mark.skip(reason="layoutlmv3废弃")
@pytest.mark.P1
def test_local_magic_pdf_layoutlmv3_yolo(self):
    """Run ``magic-pdf`` with the ``layoutlmv3`` layout model selected.

    Currently skipped. The layout setting is passed to
    ``common.update_config_file`` under the ``layout-config`` key; the
    ``auto`` output directory is then handed to
    ``common.cli_count_folders_and_check_contents``.
    """
    time.sleep(2)
    layout_settings = {"model": "layoutlmv3"}
    common.update_config_file(magic_pdf_config, "layout-config", layout_settings)
    source_pdf = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    common.delete_file(pdf_res_path)
    os.system("magic-pdf -p %s -o %s" % (source_pdf, pdf_res_path))
    output_dir = os.path.join(pdf_res_path, "test_rearch_report", "auto")
    common.cli_count_folders_and_check_contents(output_dir)
@pytest.mark.P1
def test_magic_pdf_cpu(self):
    """Run ``magic-pdf`` with ``device-mode`` set to ``cpu``.

    Both the ``table-config`` settings (``rapid_table`` enabled) and the
    ``device-mode`` value are passed to ``common.update_config_file`` before
    the CLI runs; the ``auto`` output directory is then handed to
    ``common.cli_count_folders_and_check_contents``.
    """
    time.sleep(2)
    table_settings = {
        "model": "rapid_table",
        "enable": True,
        "sub_model": "slanet_plus",
        "max_time": 400
    }
    common.update_config_file(magic_pdf_config, "table-config", table_settings)
    common.update_config_file(magic_pdf_config, "device-mode", "cpu")
    source_pdf = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    common.delete_file(pdf_res_path)
    os.system("magic-pdf -p %s -o %s" % (source_pdf, pdf_res_path))
    output_dir = os.path.join(pdf_res_path, "test_rearch_report", "auto")
    common.cli_count_folders_and_check_contents(output_dir)
@pytest.mark.P1
def test_local_magic_pdf_close_html_table(self):
    """Run ``magic-pdf`` with the table model disabled (``enable: False``).

    The table settings are passed to ``common.update_config_file`` under the
    ``table-config`` key, the CLI converts ``test_rearch_report.pdf``, and
    the test asserts that ``common.check_close_tables`` returns ``True`` for
    the generated markdown file.
    """
    time.sleep(2)
    table_settings = {
        "model": "rapid_table",
        "enable": False,
        "sub_model": "slanet_plus",
        "max_time": 400
    }
    common.update_config_file(magic_pdf_config, "table-config", table_settings)
    source_pdf = os.path.join(pdf_dev_path, "pdf", "test_rearch_report.pdf")
    common.delete_file(pdf_res_path)
    os.system("magic-pdf -p %s -o %s" % (source_pdf, pdf_res_path))
    markdown_file = os.path.join(
        pdf_res_path, "test_rearch_report", "auto", "test_rearch_report.md")
    assert common.check_close_tables(markdown_file) is True
# When this file is executed as a script, hand control to pytest.main().
if __name__ == '__main__':
pytest.main()
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/unittest/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
def test_filebased_reader_writer():
    """Round-trip bytes through FileBasedDataWriter / FileBasedDataReader.

    Both objects are created on the ``sub`` directory. The test writes and
    reads back one file addressed by a relative name (``test.txt``) and one
    addressed by an absolute path that lies outside ``sub``.

    The scratch directory under ``/tmp/magic_pdf`` is removed in a
    ``finally`` clause so that a failing assertion does not leave files
    behind for later runs.
    """
    unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
    sub_dir = os.path.join(unitest_dir, 'sub')
    abs_fn = os.path.join(unitest_dir, 'abspath.txt')
    os.makedirs(sub_dir, exist_ok=True)
    try:
        writer = FileBasedDataWriter(sub_dir)
        reader = FileBasedDataReader(sub_dir)

        # Relative name, written and read through the same directory.
        writer.write('test.txt', b'hello world')
        assert reader.read('test.txt') == b'hello world'

        # Absolute path pointing at the parent of sub_dir.
        writer.write(abs_fn, b'hello world')
        assert reader.read(abs_fn) == b'hello world'
    finally:
        shutil.rmtree(unitest_dir)
import json
import os
import fitz
import pytest
from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer():
    """Exercise MultiBucketS3DataReader / MultiBucketS3DataWriter on two buckets.

    Connection settings are read from the environment: ``S3_BUCKET``,
    ``S3_ACCESS_KEY``, ``S3_SECRET_KEY`` and ``S3_ENDPOINT`` for the default
    bucket, and the same four names with a ``_2`` suffix for the second one.
    The test is skipped when ``S3_ACCESS_KEY_2`` is not set.
    """
    env = os.environ.get
    bucket = env('S3_BUCKET', '')
    bucket_2 = env('S3_BUCKET_2', '')
    bucket_configs = [
        S3Config(
            bucket_name=bucket,
            access_key=env('S3_ACCESS_KEY', ''),
            secret_key=env('S3_SECRET_KEY', ''),
            endpoint_url=env('S3_ENDPOINT', ''),
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=env('S3_ACCESS_KEY_2', ''),
            secret_key=env('S3_SECRET_KEY_2', ''),
            endpoint_url=env('S3_ENDPOINT_2', ''),
        ),
    ]
    reader = MultiBucketS3DataReader(bucket, bucket_configs)
    writer = MultiBucketS3DataWriter(bucket, bucket_configs)

    # A bare key and its full s3:// URI must yield the same bytes.
    by_key = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
    by_uri = reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # Read a PDF from the second bucket through its s3:// URI.
    pdf_bytes = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    assert len(fitz.open('pdf', pdf_bytes)) == 10

    # The "?bytes=566,713" suffix must match read_at(..., 566, 713).
    ranged = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(ranged)) > 0

    # Write a string, then raw bytes, and read each one back.
    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    writer.write(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer_with_prefix():
    """Exercise the multi-bucket reader/writer created with ``<bucket>/<prefix>``.

    Connection settings are read from the environment: ``S3_BUCKET``,
    ``S3_ACCESS_KEY``, ``S3_SECRET_KEY`` and ``S3_ENDPOINT`` for the default
    bucket, and the same four names with a ``_2`` suffix for the second one.
    The test is skipped when ``S3_ACCESS_KEY_2`` is not set.
    """
    env = os.environ.get
    bucket = env('S3_BUCKET', '')
    bucket_2 = env('S3_BUCKET_2', '')
    bucket_configs = [
        S3Config(
            bucket_name=bucket,
            access_key=env('S3_ACCESS_KEY', ''),
            secret_key=env('S3_SECRET_KEY', ''),
            endpoint_url=env('S3_ENDPOINT', ''),
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=env('S3_ACCESS_KEY_2', ''),
            secret_key=env('S3_SECRET_KEY_2', ''),
            endpoint_url=env('S3_ENDPOINT_2', ''),
        ),
    ]
    prefix = 'meta-index'
    reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', bucket_configs)
    writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', bucket_configs)

    # A key without the prefix and the full s3:// URI must yield the same bytes.
    by_key = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
    by_uri = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # Read a PDF from the second bucket through its s3:// URI.
    pdf_bytes = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    assert len(fitz.open('pdf', pdf_bytes)) == 10

    # The "?bytes=566,713" suffix must match read_at(..., 566, 713).
    ranged = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(ranged)) > 0

    # Write a string, then read it back by bare key and by full URI.
    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )
    assert 'abc'.encode() == reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # Write raw bytes and read them back.
    writer.write(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
import json
import os
import pytest
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
)
def test_s3_reader_writer():
    """Exercise S3DataReader / S3DataWriter created with an empty prefix.

    Connection settings are read from ``S3_BUCKET``, ``S3_ACCESS_KEY``,
    ``S3_SECRET_KEY`` and ``S3_ENDPOINT``; the test is skipped when
    ``S3_ACCESS_KEY`` is not set.
    """
    bucket = os.environ.get('S3_BUCKET', '')
    credentials = (
        os.environ.get('S3_ACCESS_KEY', ''),
        os.environ.get('S3_SECRET_KEY', ''),
        os.environ.get('S3_ENDPOINT', ''),
    )
    reader = S3DataReader('', bucket, *credentials)
    writer = S3DataWriter('', bucket, *credentials)

    # A bare key and its full s3:// URI must yield the same bytes.
    by_key = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')
    by_uri = reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # The "?bytes=566,713" suffix must match read_at(..., 566, 713).
    ranged = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(ranged)) > 0

    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # NOTE(review): the write key starts with the bucket name while the read
    # key below does not - confirm this asymmetry is intentional.
    writer.write(
        f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
)
def test_s3_reader_writer_with_prefix():
    """Exercise S3DataReader / S3DataWriter created with the ``meta-index`` prefix.

    Connection settings are read from ``S3_BUCKET``, ``S3_ACCESS_KEY``,
    ``S3_SECRET_KEY`` and ``S3_ENDPOINT``; the test is skipped when
    ``S3_ACCESS_KEY`` is not set.
    """
    bucket = os.environ.get('S3_BUCKET', '')
    credentials = (
        os.environ.get('S3_ACCESS_KEY', ''),
        os.environ.get('S3_SECRET_KEY', ''),
        os.environ.get('S3_ENDPOINT', ''),
    )
    prefix = 'meta-index'
    reader = S3DataReader(prefix, bucket, *credentials)
    writer = S3DataWriter(prefix, bucket, *credentials)

    # A key without the prefix and the full s3:// URI must yield the same bytes.
    by_key = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
    by_uri = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # The "?bytes=566,713" suffix must match read_at(..., 566, 713).
    ranged = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(ranged)) > 0

    # Write a string, then read it back by bare key and by full URI.
    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )
    assert 'abc'.encode() == reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # NOTE(review): the write key starts with "<bucket>/<prefix>/" while the
    # read key below does not - confirm this asymmetry is intentional.
    writer.write(
        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
import json
import os
import pytest
from magic_pdf.data.io.s3 import S3Reader, S3Writer
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
)
def test_s3_reader():
    """Read a whole object and a byte range through S3Reader.

    Connection settings are read from ``S3_BUCKET``, ``S3_ACCESS_KEY``,
    ``S3_SECRET_KEY`` and ``S3_ENDPOINT``; the test is skipped when
    ``S3_ACCESS_KEY`` is not set.
    """
    connection = dict(
        bucket=os.environ.get('S3_BUCKET', ''),
        ak=os.environ.get('S3_ACCESS_KEY', ''),
        sk=os.environ.get('S3_SECRET_KEY', ''),
        endpoint_url=os.environ.get('S3_ENDPOINT', ''),
    )
    reader = S3Reader(**connection)

    whole = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert len(whole) > 0

    # The slice returned by read_at must parse as non-empty JSON.
    fragment = reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        566,
        713,
    )
    assert len(json.loads(fragment)) > 0
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
)
def test_s3_writer():
    """Write bytes with S3Writer and read them back with S3Reader.

    Connection settings are read from ``S3_BUCKET``, ``S3_ACCESS_KEY``,
    ``S3_SECRET_KEY`` and ``S3_ENDPOINT``; the test is skipped when
    ``S3_ACCESS_KEY`` is not set.
    """
    connection = dict(
        bucket=os.environ.get('S3_BUCKET', ''),
        ak=os.environ.get('S3_ACCESS_KEY', ''),
        sk=os.environ.get('S3_SECRET_KEY', ''),
        endpoint_url=os.environ.get('S3_ENDPOINT', ''),
    )
    object_key = 'unittest/io/test.jsonl'

    S3Writer(**connection).write(object_key, '123'.encode())

    stored = S3Reader(**connection).read(object_key)
    assert stored.decode() == '123'
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
    """Build a PymuDocDataset from the bytes of ``test_01.pdf``.

    Asserts that the dataset has at least one page and that the ``h``
    attribute of the first page's info is greater than 100.
    """
    pdf_file = 'tests/unittest/test_data/assets/pdfs/test_01.pdf'
    with open(pdf_file, 'rb') as handle:
        pdf_bytes = handle.read()

    dataset = PymuDocDataset(pdf_bytes)
    assert len(dataset) > 0

    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.h > 100
def test_imagedataset():
    """Build an ImageDataset from the bytes of ``test_01.png``.

    Asserts that the dataset has exactly one page and that the ``w``
    attribute of that page's info is greater than 100.
    """
    image_file = 'tests/unittest/test_data/assets/pngs/test_01.png'
    with open(image_file, 'rb') as handle:
        image_bytes = handle.read()

    dataset = ImageDataset(image_bytes)
    assert len(dataset) == 1

    page_info = dataset.get_page(0).get_page_info()
    assert page_info.w > 100
import pytest
import json
from magic_pdf.libs.json_compressor import JsonCompressor
# Test data fixtures
@pytest.fixture
def test_cases():
    """Return a list of JSON-compatible sample values used for round-trip tests."""
    flat = {"name": "John", "age": 30}
    nested = {
        "person": {
            "name": "Alice",
            "address": {"street": "123 Main St", "city": "New York"},
        }
    }
    records = [
        {"id": 1, "value": "first"},
        {"id": 2, "value": "second"},
    ]
    # One value of every JSON type.
    mixed_types = {
        "string": "hello",
        "integer": 42,
        "float": 3.14,
        "boolean": True,
        "null": None,
        "array": [1, 2, 3],
        "nested": {"key": "value"},
    }
    empty_containers = {"empty_list": [], "empty_dict": {}}
    return [flat, nested, records, mixed_types, {}, [], empty_containers]
@pytest.fixture
def large_data():
    """Return a dict whose ``data`` list holds 100 copies of a 400-character string."""
    chunk = "test" * 100
    return {"data": [chunk for _ in range(100)]}
def test_compression_decompression_cycle(test_cases):
    """Every sample must survive compress_json followed by decompress_json."""
    for sample in test_cases:
        packed = JsonCompressor.compress_json(sample)

        # The compressed form is expected to be a non-empty str.
        assert isinstance(packed, str)
        assert len(packed) > 0

        restored = JsonCompressor.decompress_json(packed)
        assert sample == restored
def test_compression_reduces_size(large_data):
    """The compressed string must be shorter than ``json.dumps`` of the same data."""
    plain_length = len(json.dumps(large_data))
    packed_length = len(JsonCompressor.compress_json(large_data))
    assert packed_length < plain_length
def test_invalid_json_serializable():
    """compress_json must raise TypeError for a value that is not JSON serializable."""
    not_serializable = {1, 2, 3}  # a set has no JSON representation
    with pytest.raises(TypeError):
        JsonCompressor.compress_json(not_serializable)
def test_invalid_compressed_string():
    """decompress_json must raise when given a string that was not produced by compress_json."""
    bogus = "invalid_base64_string"
    with pytest.raises(Exception):
        JsonCompressor.decompress_json(bogus)
def test_empty_string_input():
    """decompress_json must raise when given an empty string."""
    empty = ""
    with pytest.raises(Exception):
        JsonCompressor.decompress_json(empty)
def test_special_characters():
    """Punctuation and non-ASCII text must survive a compress/decompress round trip."""
    test_data = {
        "special": "!@#$%^&*()_+-=[]{}|;:,.<>?",
        "unicode": "Hello 世界 🌍"
    }
    round_tripped = JsonCompressor.decompress_json(
        JsonCompressor.compress_json(test_data)
    )
    assert test_data == round_tripped
# Parametrized test for different types of input
@pytest.mark.parametrize("test_input", [
    {"simple": "value"},
    [1, 2, 3],
    {"nested": {"key": "value"}},
    ["mixed", 1, True, None],
    {"unicode": "🌍"}
])
def test_various_input_types(test_input):
    """Each parametrized value must survive a compress/decompress round trip."""
    round_tripped = JsonCompressor.decompress_json(
        JsonCompressor.compress_json(test_input)
    )
    assert test_input == round_tripped
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment