Unverified Commit 3a42ebbf authored by Xiaomeng Zhao, committed by GitHub
Browse files

Merge pull request #838 from opendatalab/release-0.9.0

Release 0.9.0
parents 765c6d77 14024793
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
    """Build a ``PymuDocDataset`` from the raw bytes of ``test_01.pdf``.

    The dataset must report at least one page, and the page info of the
    first page must expose an ``h`` value greater than 100.
    """
    pdf_path = 'tests/test_data/assets/pdfs/test_01.pdf'
    with open(pdf_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    dataset = PymuDocDataset(pdf_bytes)
    assert len(dataset) > 0

    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.h > 100
def test_imagedataset():
    """Build an ``ImageDataset`` from the raw bytes of ``test_01.png``.

    The dataset must contain exactly one page, and the page info of that
    page must expose a ``w`` value greater than 100.
    """
    png_path = 'tests/test_data/assets/pngs/test_01.png'
    with open(png_path, 'rb') as png_file:
        png_bytes = png_file.read()

    dataset = ImageDataset(png_bytes)
    assert len(dataset) == 1

    only_page_info = dataset.get_page(0).get_page_info()
    assert only_page_info.w > 100
import os
import pytest
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.read_api import (read_jsonl, read_local_images,
read_local_pdfs)
from magic_pdf.data.schemas import S3Config
def test_read_local_pdfs():
    """Read the PDF asset directory with ``read_local_pdfs``.

    Expects exactly two datasets, each with at least one page, and positive
    ``w`` and ``h`` values in the page info of the first dataset's page 0.
    """
    pdf_datasets = read_local_pdfs('tests/test_data/assets/pdfs')
    assert len(pdf_datasets) == 2

    # Both datasets must be non-empty.
    for index in (0, 1):
        assert len(pdf_datasets[index]) > 0

    first = pdf_datasets[0]
    assert first.get_page(0).get_page_info().w > 0
    assert first.get_page(0).get_page_info().h > 0
def test_read_local_images():
    """Read the PNG asset directory with ``read_local_images``.

    Only files matching the ``png`` suffix are requested. Expects exactly two
    datasets of one page each, and positive ``w`` and ``h`` values in the page
    info of the first dataset's page 0.
    """
    image_datasets = read_local_images(
        'tests/test_data/assets/pngs', suffixes=['png']
    )
    assert len(image_datasets) == 2

    # Each image dataset holds exactly one page.
    for index in (0, 1):
        assert len(image_datasets[index]) == 1

    first = image_datasets[0]
    assert first.get_page(0).get_page_info().w > 0
    assert first.get_page(0).get_page_info().h > 0
@pytest.mark.skipif(
    # The test reads all eight variables below; skip unless every one is set,
    # otherwise the S3 configs would be built from empty strings.
    any(
        os.getenv(name) is None
        for name in (
            'S3_BUCKET',
            'S3_ACCESS_KEY',
            'S3_SECRET_KEY',
            'S3_ENDPOINT',
            'S3_BUCKET_2',
            'S3_ACCESS_KEY_2',
            'S3_SECRET_KEY_2',
            'S3_ENDPOINT_2',
        )
    ),
    reason='need s3 config!',
)
def test_read_json():
    """Exercise ``read_jsonl`` against S3 and local JSONL files.

    Requires two S3 configurations in the environment; the test is skipped
    unless all of the following variables are exported::

        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
        export S3_BUCKET_2=xxx
        export S3_ACCESS_KEY_2=xxx
        export S3_SECRET_KEY_2=xxx
        export S3_ENDPOINT_2=xxx

    Three reads are checked: a JSONL file addressed by ``s3://`` URL through
    a ``MultiBucketS3DataReader``, a local JSONL file read with that same
    reader, and a local JSONL file read without any reader.
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    bucket_2 = os.getenv('S3_BUCKET_2', '')
    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')

    s3configs = [
        S3Config(
            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=ak_2,
            secret_key=sk_2,
            endpoint_url=endpoint_url_2,
        ),
    ]

    # The first bucket is passed as the reader's first argument
    # (presumably the default bucket -- confirm against MultiBucketS3DataReader).
    reader = MultiBucketS3DataReader(bucket, s3configs)

    # 1) JSONL file addressed by s3:// URL, read through the S3 reader.
    datasets = read_jsonl(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        reader,
    )
    assert len(datasets) > 0
    assert len(datasets[0]) == 10

    # 2) Local JSONL file, still passing the S3 reader.
    datasets = read_jsonl('tests/test_data/assets/jsonl/test_01.jsonl', reader)
    assert len(datasets) == 1
    assert len(datasets[0]) == 10

    # 3) Local JSONL file with no reader.
    datasets = read_jsonl('tests/test_data/assets/jsonl/test_02.jsonl')
    assert len(datasets) == 1
    assert len(datasets[0]) == 1
[
{
"layout_dets": [
{
"category_id": 3,
"poly": [
776.7277221679688,
688.448974609375,
1242.224365234375,
688.448974609375,
1242.224365234375,
1182.0628662109375,
776.7277221679688,
1182.0628662109375
],
"score": 0.999997079372406
},
{
"category_id": 3,
"poly": [
775.9269409179688,
1389.754638671875,
1243.672119140625,
1389.754638671875,
1243.672119140625,
1859.716064453125,
775.9269409179688,
1859.716064453125
],
"score": 0.9999949932098389
},
{
"category_id": 1,
"poly": [
752.11572265625,
1939.3634033203125,
1430.1146240234375,
1939.3634033203125,
1430.1146240234375,
2041.1771240234375,
752.11572265625,
2041.1771240234375
],
"score": 0.999975323677063
},
{
"category_id": 3,
"poly": [
46.55152893066406,
686.12939453125,
638.8861083984375,
686.12939453125,
638.8861083984375,
1803.419189453125,
46.55152893066406,
1803.419189453125
],
"score": 0.999961256980896
},
{
"category_id": 3,
"poly": [
33.684722900390625,
150.77980041503906,
1238.0679931640625,
150.77980041503906,
1238.0679931640625,
524.98291015625,
33.684722900390625,
524.98291015625
],
"score": 0.9999504089355469
},
{
"category_id": 1,
"poly": [
24.685693740844727,
1875.9998779296875,
703.5064697265625,
1875.9998779296875,
703.5064697265625,
2050.7431640625,
24.685693740844727,
2050.7431640625
],
"score": 0.9999105334281921
},
{
"category_id": 1,
"poly": [
750.97705078125,
1252.206787109375,
1430.0809326171875,
1252.206787109375,
1430.0809326171875,
1357.2947998046875,
750.97705078125,
1357.2947998046875
],
"score": 0.999853789806366
},
{
"category_id": 4,
"poly": [
904.842041015625,
1213.027099609375,
1273.5655517578125,
1213.027099609375,
1273.5655517578125,
1242.717529296875,
904.842041015625,
1242.717529296875
],
"score": 0.9995817542076111
},
{
"category_id": 4,
"poly": [
905.3208618164062,
1898.5325927734375,
1273.1282958984375,
1898.5325927734375,
1273.1282958984375,
1928.9906005859375,
905.3208618164062,
1928.9906005859375
],
"score": 0.9986443519592285
},
{
"category_id": 4,
"poly": [
372.0135498046875,
556.02685546875,
1084.9647216796875,
556.02685546875,
1084.9647216796875,
586.6792602539062,
372.0135498046875,
586.6792602539062
],
"score": 0.9985352754592896
},
{
"category_id": 2,
"poly": [
1350.63671875,
79.77919006347656,
1379.6220703125,
79.77919006347656,
1379.6220703125,
99.83788299560547,
1350.63671875,
99.83788299560547
],
"score": 0.9973036646842957
},
{
"category_id": 4,
"poly": [
203.2659912109375,
597.2034912109375,
1251.0240478515625,
597.2034912109375,
1251.0240478515625,
657.985595703125,
203.2659912109375,
657.985595703125
],
"score": 0.9622809886932373
},
{
"category_id": 0,
"poly": [
70.87332916259766,
1834.5714111328125,
657.8504638671875,
1834.5714111328125,
657.8504638671875,
1865.07373046875,
70.87332916259766,
1865.07373046875
],
"score": 0.8580453395843506
},
{
"category_id": 1,
"poly": [
189.0360870361328,
597.2406616210938,
1252.3204345703125,
597.2406616210938,
1252.3204345703125,
658.4781494140625,
189.0360870361328,
658.4781494140625
],
"score": 0.3083903193473816
},
{
"category_id": 13,
"poly": [
1190,
1980,
1206,
1980,
1206,
1997,
1190,
1997
],
"score": 0.51,
"latex": ":"
},
{
"category_id": 13,
"poly": [
1219,
1331,
1235,
1331,
1235,
1348,
1219,
1348
],
"score": 0.49,
"latex": ":"
},
{
"category_id": 13,
"poly": [
798,
2016,
813,
2016,
813,
2033,
798,
2033
],
"score": 0.41,
"latex": ":"
},
{
"category_id": 13,
"poly": [
135,
1991,
148,
1991,
148,
2006,
135,
2006
],
"score": 0.39,
"latex": ":"
},
{
"category_id": 13,
"poly": [
400,
1916,
416,
1916,
416,
1933,
400,
1933
],
"score": 0.38,
"latex": ":"
},
{
"category_id": 13,
"poly": [
1148,
1944,
1162,
1944,
1162,
1961,
1148,
1961
],
"score": 0.31,
"latex": ":"
},
{
"category_id": 15,
"poly": [
798.0,
1943.0,
1147.0,
1943.0,
1147.0,
1968.0,
798.0,
1968.0
],
"score": 0.95,
"text": "Fig 4 SSCP analysis of FHIT exon 4. T"
},
{
"category_id": 15,
"poly": [
1163.0,
1943.0,
1425.0,
1943.0,
1425.0,
1968.0,
1163.0,
1968.0
],
"score": 0.96,
"text": "Tumor tissue ; N :Corresponding"
},
{
"category_id": 15,
"poly": [
755.0,
1979.0,
1189.0,
1979.0,
1189.0,
2004.0,
755.0,
2004.0
],
"score": 0.92,
"text": "normal tissue ; M : PBR322/Hae II Marker ; ssDNA"
},
{
"category_id": 15,
"poly": [
1207.0,
1979.0,
1422.0,
1979.0,
1422.0,
2004.0,
1207.0,
2004.0
],
"score": 0.97,
"text": "Single-stranded DNA ; ds-"
},
{
"category_id": 15,
"poly": [
755.0,
2015.0,
797.0,
2015.0,
797.0,
2038.0,
755.0,
2038.0
],
"score": 1.0,
"text": "DNA"
},
{
"category_id": 15,
"poly": [
814.0,
2015.0,
996.0,
2015.0,
996.0,
2038.0,
814.0,
2038.0
],
"score": 0.98,
"text": "Double-stranded DNA"
},
{
"category_id": 15,
"poly": [
71.0,
1880.0,
698.0,
1880.0,
698.0,
1902.0,
71.0,
1902.0
],
"score": 0.96,
"text": "Fig 2Alterations of PCR amplified products of FHIT exon 3,4,5 and"
},
{
"category_id": 15,
"poly": [
28.0,
1916.0,
399.0,
1916.0,
399.0,
1937.0,
28.0,
1937.0
],
"score": 0.98,
"text": "microsatellite marker D3S1300、D3S1312.A"
},
{
"category_id": 15,
"poly": [
417.0,
1916.0,
701.0,
1916.0,
701.0,
1937.0,
417.0,
1937.0
],
"score": 0.9,
"text": "Deletion of exon5(arrows);B :"
},
{
"category_id": 15,
"poly": [
29.0,
1953.0,
700.0,
1953.0,
700.0,
1974.0,
29.0,
1974.0
],
"score": 0.95,
"text": "Deletion of exon 3 A( arrows);C : Deletion of microsatellite marker D3S1300,"
},
{
"category_id": 15,
"poly": [
28.0,
1989.0,
134.0,
1989.0,
134.0,
2014.0,
28.0,
2014.0
],
"score": 1.0,
"text": "D3S1312.T"
},
{
"category_id": 15,
"poly": [
149.0,
1989.0,
696.0,
1989.0,
696.0,
2014.0,
149.0,
2014.0
],
"score": 0.96,
"text": "Tumor ; N : Corresponding normal tissue ; L : Corresponding lymph"
},
{
"category_id": 15,
"poly": [
30.0,
2027.0,
634.0,
2027.0,
634.0,
2047.0,
30.0,
2047.0
],
"score": 0.94,
"text": "node tissue;M :DL2000 DNA marker;L1:Lewis ;A :A549;S SPAC-1"
},
{
"category_id": 15,
"poly": [
801.0,
1259.0,
1427.0,
1259.0,
1427.0,
1280.0,
801.0,
1280.0
],
"score": 0.94,
"text": "Fig 3SSCP analysis of FHIT exon 3.The arrow indicateda deletion of"
},
{
"category_id": 15,
"poly": [
757.0,
1294.0,
1424.0,
1294.0,
1424.0,
1318.0,
757.0,
1318.0
],
"score": 0.96,
"text": "exon 3 of 41T. T : Tumor tissue ; N : Corresponding normal tissue ; M PBR322/"
},
{
"category_id": 15,
"poly": [
755.0,
1329.0,
1218.0,
1329.0,
1218.0,
1355.0,
755.0,
1355.0
],
"score": 0.95,
"text": "Hae Il Marker / ssDNA : Single-stranded DNA ; dsDNA"
},
{
"category_id": 15,
"poly": [
1236.0,
1329.0,
1418.0,
1329.0,
1418.0,
1355.0,
1236.0,
1355.0
],
"score": 1.0,
"text": "Double-strandedDNA"
},
{
"category_id": 15,
"poly": [
910.0,
1217.0,
1269.0,
1217.0,
1269.0,
1241.0,
910.0,
1241.0
],
"score": 1.0,
"text": "图3FHIT基因外显子3的SSCP分析"
},
{
"category_id": 15,
"poly": [
909.0,
1904.0,
1269.0,
1904.0,
1269.0,
1927.0,
909.0,
1927.0
],
"score": 1.0,
"text": "图4FHIT基因外显子4的SSCP分析"
},
{
"category_id": 15,
"poly": [
374.0,
563.0,
1077.0,
563.0,
1077.0,
583.0,
374.0,
583.0
],
"score": 0.99,
"text": "图1FHIT基因外显子3、4、5、8和微卫星灶的PCR扩增产物琼脂糖电泳图"
},
{
"category_id": 15,
"poly": [
1351.0,
81.0,
1376.0,
81.0,
1376.0,
102.0,
1351.0,
102.0
],
"score": 1.0,
"text": "13"
},
{
"category_id": 15,
"poly": [
207.0,
600.0,
1245.0,
600.0,
1245.0,
624.0,
207.0,
624.0
],
"score": 0.96,
"text": "Fig 1 Agarose electrophoresis of PCR products of exor( A)3 ,4 ,5 ,8 and three microsatellite markers( B)of FHIT gene"
},
{
"category_id": 15,
"poly": [
309.0,
634.0,
1142.0,
634.0,
1142.0,
662.0,
309.0,
662.0
],
"score": 0.97,
"text": "M1 :DL2000 DNA marker ; M2 PBR322/Hae Il marker ; T :Tumor ; N :Corresponding normal tissue"
},
{
"category_id": 15,
"poly": [
73.0,
1840.0,
651.0,
1840.0,
651.0,
1864.0,
73.0,
1864.0
],
"score": 1.0,
"text": "图2FHIT基因外显子和微卫星灶PCR扩增产物缺失电泳图"
},
{
"category_id": 15,
"poly": [
207.0,
600.0,
1245.0,
600.0,
1245.0,
625.0,
207.0,
625.0
],
"score": 0.96,
"text": "Fig 1 Agarose electrophoresis of PCR products of exor A)3 ,4 ,5 ,8 and three microsatellite markers( B)of FHIT gene"
},
{
"category_id": 15,
"poly": [
309.0,
635.0,
1142.0,
635.0,
1142.0,
661.0,
309.0,
661.0
],
"score": 0.97,
"text": "M1 :DL2000 DNA marker ; M2 PBR322/Hae Il marker ; T Tumor ; N :Corresponding normal tissue"
}
],
"page_info": {
"page_no": 0,
"height": 2080,
"width": 1472
}
}
]
This source diff could not be displayed because it is too large. You can view the blob instead.
import json
from magic_pdf.data.read_api import read_local_pdfs
from magic_pdf.model.magic_model import MagicModel
def test_magic_model_image_v2():
    """Smoke-test ``MagicModel.get_imgs_v2`` and ``get_tables_v2`` on test_01.

    Loads the ``test_01`` PDF and its matching model JSON, builds a
    ``MagicModel`` from the first dataset, and calls both getters with
    argument ``0`` (presumably a page index -- confirm against MagicModel).
    The results are only printed; the test passes as long as nothing raises.
    """
    datasets = read_local_pdfs('tests/test_model/assets/test_01.pdf')
    # JSON text is UTF-8; pin the encoding so loading does not depend on the
    # platform's locale default.
    with open('tests/test_model/assets/test_01.model.json', encoding='utf-8') as f:
        model_json = json.load(f)

    magic_model = MagicModel(model_json, datasets[0])

    imgs = magic_model.get_imgs_v2(0)
    print(imgs)

    tables = magic_model.get_tables_v2(0)
    print(tables)
def test_magic_model_table_v2():
    """Smoke-test ``MagicModel.get_tables_v2`` on test_02.

    Loads the ``test_02`` PDF and its matching model JSON, builds a
    ``MagicModel`` from the first dataset, and calls ``get_tables_v2`` with
    arguments ``5`` and ``8`` (presumably page indices -- confirm against
    MagicModel). The results are only printed; the test passes as long as
    nothing raises.
    """
    datasets = read_local_pdfs('tests/test_model/assets/test_02.pdf')
    # JSON text is UTF-8; pin the encoding so loading does not depend on the
    # platform's locale default.
    with open('tests/test_model/assets/test_02.model.json', encoding='utf-8') as f:
        model_json = json.load(f)

    magic_model = MagicModel(model_json, datasets[0])

    tables = magic_model.get_tables_v2(5)
    print(tables)

    tables = magic_model.get_tables_v2(8)
    print(tables)
import pytest
from PIL import Image
from magic_pdf.model.ppTableModel import ppTableModel
class TestppTableModel:
    """Tests for ``ppTableModel``."""

    def test_image2html(self):
        """Check that ``img2html`` reproduces the expected HTML for table.jpg.

        The model is configured for the ``cuda`` device and a hard-coded
        model directory, so the test only runs where both are available.
        """
        # Change the table model path to match the local model location.
        # NOTE(review): the path below is specific to one developer machine.
        config = {"device": "cuda",
                  "model_dir": "/home/quyuan/.cache/modelscope/hub/opendatalab/PDF-Extract-Kit/models/TabRec/TableMaster"}
        table_model = ppTableModel(config)
        # Open the image in a context manager so its file handle is released
        # even when the model call or the assertion fails.
        with Image.open("tests/unittest/test_table/assets/table.jpg") as img:
            res = table_model.img2html(img)
        true_value = """<td><table border="1"><thead><tr><td><b>Methods</b></td><td><b>R</b></td><td><b>P</b></td><td><b>F</b></td><td><b>FPS</b></td></tr></thead><tbody><tr><td>SegLink[26]</td><td>70.0</td><td>86.0</td><td>77.0</td><td>8.9</td></tr><tr><td>PixelLink[4]</td><td>73.2</td><td>83.0</td><td>77.8</td></tr><tr><td>TextSnake [18]</td><td>73.9</td><td>83.2</td><td>78.3</td><td>1.1</td></tr><tr><td>TextField [37]</td><td>75.9</td><td>87.4</td><td>81.3</td><td>5.2</td></tr><tr><td>MSR[38]</td><td>76.7</td><td>87.4</td><td>81.7</td><td>-</td></tr><tr><td>FTSN [3]</td><td>77.1</td><td>87.6</td><td>82.0</td><td>-</td></tr><tr><td>LSE[30]</td><td>81.7</td><td>84.2</td><td>82.9</td><td>-</td></tr><tr><td>CRAFT [2]</td><td>78.2</td><td>88.2</td><td>82.9</td><td>8.6</td></tr><tr><td>MCN[16]</td><td>79</td><td>88</td><td>83</td><td>-</td></tr><tr><td>ATRR[35]</td><td>82.1</td><td>85.2</td><td>83.6</td><td></td></tr><tr><td>PAN [34]</td><td>83.8</td><td>84.4</td><td>84.1</td><td>30.2</td></tr><tr><td>DB[12]</td><td>79.2</td><td>91.5</td><td>84.9</td><td>32.0</td></tr><tr><td>DRRG [41]</td><td>82.30</td><td>88.05</td><td>85.08</td></tr><tr><td>Ours (SynText)</td><td>80.68</td><td>85.40</td><td>82.97</td><td>12.68</td></tr><tr><td>Ours (MLT-17)</td><td>84.54</td><td>86.62</td></tr></tbody></table></td>\n"""
        assert res == true_value
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment