".github/vscode:/vscode.git/clone" did not exist on "3ed0a547b233eaf1153409ba4e59a21da0aa3883"
Commit 90cf1082 authored by myhloli's avatar myhloli
Browse files

test(unitest): Restore unit test cases

parent 8e981b3a
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"s3://sci-hub/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
{"track_id":"e8824f5a-9fcb-4ee5-b2d4-6bf2c67019dc","path":"tests/test_data/assets/pdfs/test_02.pdf","file_type":"pdf","content_type":"application/pdf","content_length":80078,"title":"German Idealism and the Concept of Punishment || Conclusion","remark":{"file_id":"scihub_78800000/libgen.scimag78872000-78872999.zip_10.1017/cbo9780511770425.012","file_source_type":"paper","original_file_id":"10.1017/cbo9780511770425.012","file_name":"10.1017/cbo9780511770425.012.pdf","author":"Merle, Jean-Christophe"}}
import os
import shutil
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
def test_filebased_reader_writer():
    """Round-trip bytes through FileBasedDataWriter and FileBasedDataReader.

    Both objects are constructed with ``sub_dir``.  Two paths are exercised:
    a bare file name (``test.txt``) and an absolute path (``abs_fn``) that
    lives in the parent of ``sub_dir``.

    The scratch tree is removed in a ``finally`` clause so that a failing
    assertion does not leave files behind for later runs.
    """
    unitest_dir = '/tmp/magic_pdf/unittest/data/filebased_reader_writer'
    sub_dir = os.path.join(unitest_dir, 'sub')
    abs_fn = os.path.join(unitest_dir, 'abspath.txt')

    # Creating sub_dir also creates unitest_dir, which is what gets removed.
    os.makedirs(sub_dir, exist_ok=True)
    try:
        writer = FileBasedDataWriter(sub_dir)
        reader = FileBasedDataReader(sub_dir)

        # Bare file name.
        writer.write('test.txt', b'hello world')
        assert reader.read('test.txt') == b'hello world'

        # Absolute path outside sub_dir.
        writer.write(abs_fn, b'hello world')
        assert reader.read(abs_fn) == b'hello world'
    finally:
        shutil.rmtree(unitest_dir)
import json
import os
import fitz
import pytest
from magic_pdf.data.data_reader_writer import (MultiBucketS3DataReader,
MultiBucketS3DataWriter)
from magic_pdf.data.schemas import S3Config
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer():
    """Exercise MultiBucketS3DataReader/Writer with two bucket configs.

    Skipped unless ``S3_ACCESS_KEY_2`` is set.  Connection settings are read
    from the environment:

        S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, S3_ENDPOINT
        S3_BUCKET_2, S3_ACCESS_KEY_2, S3_SECRET_KEY_2, S3_ENDPOINT_2
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')

    first_config = S3Config(
        bucket_name=bucket,
        access_key=os.getenv('S3_ACCESS_KEY', ''),
        secret_key=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    second_config = S3Config(
        bucket_name=bucket_2,
        access_key=os.getenv('S3_ACCESS_KEY_2', ''),
        secret_key=os.getenv('S3_SECRET_KEY_2', ''),
        endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
    )
    s3configs = [first_config, second_config]

    reader = MultiBucketS3DataReader(bucket, s3configs)
    writer = MultiBucketS3DataWriter(bucket, s3configs)

    # A bare key and the matching s3:// URI must yield the same bytes.
    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    by_key = reader.read(jsonl_key)
    by_uri = reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # An object addressed through the second bucket's s3:// URI.
    pdf_bytes = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    pdf_doc = fitz.open('pdf', pdf_bytes)
    assert len(pdf_doc) == 10

    # The "?bytes=566,713" suffix must agree with read_at(key, 566, 713).
    ranged = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # write_string followed by a read of the same key.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)

    # write (bytes) followed by a read of the same key.
    bytes_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    writer.write(bytes_key, '123'.encode())
    assert '123'.encode() == reader.read(bytes_key)
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_multi_bucket_s3_reader_writer_with_prefix():
    """Exercise MultiBucketS3DataReader/Writer built with "<bucket>/<prefix>".

    Skipped unless ``S3_ACCESS_KEY_2`` is set.  Connection settings are read
    from the environment:

        S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, S3_ENDPOINT
        S3_BUCKET_2, S3_ACCESS_KEY_2, S3_SECRET_KEY_2, S3_ENDPOINT_2
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')

    first_config = S3Config(
        bucket_name=bucket,
        access_key=os.getenv('S3_ACCESS_KEY', ''),
        secret_key=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    second_config = S3Config(
        bucket_name=bucket_2,
        access_key=os.getenv('S3_ACCESS_KEY_2', ''),
        secret_key=os.getenv('S3_SECRET_KEY_2', ''),
        endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
    )
    s3configs = [first_config, second_config]

    prefix = 'meta-index'
    reader = MultiBucketS3DataReader(f'{bucket}/{prefix}', s3configs)
    writer = MultiBucketS3DataWriter(f'{bucket}/{prefix}', s3configs)

    # A key given without the prefix must match the full s3:// URI with it.
    jsonl_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'
    by_key = reader.read(jsonl_key)
    by_uri = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # An object addressed through the second bucket's s3:// URI.
    pdf_bytes = reader.read(
        f's3://{bucket_2}/enbook-scimag/78800000/libgen.scimag78872000-78872999/10.1017/cbo9780511770425.012.pdf'
    )
    pdf_doc = fitz.open('pdf', pdf_bytes)
    assert len(pdf_doc) == 10

    # The "?bytes=566,713" suffix must agree with read_at(key, 566, 713).
    ranged = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # write_string, then read back by bare key and by full s3:// URI.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)
    assert 'abc'.encode() == reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # write (bytes) followed by a read of the same key.
    bytes_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    writer.write(bytes_key, '123'.encode())
    assert '123'.encode() == reader.read(bytes_key)
import json
import os
import pytest
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
)
def test_s3_reader_writer():
    """Exercise S3DataReader / S3DataWriter constructed with an empty prefix.

    Skipped unless ``S3_ACCESS_KEY`` is set.  Connection settings are read
    from the environment: S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY and
    S3_ENDPOINT.
    """
    bucket = os.getenv('S3_BUCKET', '')
    access_key = os.getenv('S3_ACCESS_KEY', '')
    secret_key = os.getenv('S3_SECRET_KEY', '')
    endpoint = os.getenv('S3_ENDPOINT', '')

    reader = S3DataReader('', bucket, access_key, secret_key, endpoint)
    writer = S3DataWriter('', bucket, access_key, secret_key, endpoint)

    # A bare key and the matching s3:// URI must yield the same bytes.
    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    by_key = reader.read(jsonl_key)
    by_uri = reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # The "?bytes=566,713" suffix must agree with read_at(key, 566, 713).
    ranged = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # write_string followed by a read of the same key.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)

    # The write path carries the bucket name; the read uses the bare key.
    writer.write(
        f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    read_back = reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
    assert '123'.encode() == read_back
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='need s3 config!'
)
def test_s3_reader_writer_with_prefix():
    """Exercise S3DataReader / S3DataWriter constructed with a key prefix.

    Skipped unless ``S3_ACCESS_KEY`` is set.  Connection settings are read
    from the environment: S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY and
    S3_ENDPOINT.
    """
    bucket = os.getenv('S3_BUCKET', '')
    access_key = os.getenv('S3_ACCESS_KEY', '')
    secret_key = os.getenv('S3_SECRET_KEY', '')
    endpoint = os.getenv('S3_ENDPOINT', '')
    prefix = 'meta-index'

    reader = S3DataReader(prefix, bucket, access_key, secret_key, endpoint)
    writer = S3DataWriter(prefix, bucket, access_key, secret_key, endpoint)

    # A key given without the prefix must match the full s3:// URI with it.
    jsonl_key = 'scihub/v001/scihub/part-66210c190659-000026.jsonl'
    by_key = reader.read(jsonl_key)
    by_uri = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert by_key == by_uri

    # The "?bytes=566,713" suffix must agree with read_at(key, 566, 713).
    ranged = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert ranged == reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(ranged)) > 0

    # write_string, then read back by bare key and by full s3:// URI.
    text_key = 'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    writer.write_string(text_key, 'abc')
    assert 'abc'.encode() == reader.read(text_key)
    assert 'abc'.encode() == reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # The write path carries bucket and prefix; the read uses the bare key.
    writer.write(
        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )
    read_back = reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
    assert '123'.encode() == read_back
import json
import os
import pytest
from magic_pdf.data.io.s3 import S3Reader, S3Writer
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
)
def test_s3_reader():
    """Check S3Reader.read and S3Reader.read_at against a known JSONL object.

    Skipped unless ``S3_ACCESS_KEY`` is set.  Connection settings are read
    from the environment: S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY and
    S3_ENDPOINT.
    """
    connection = {
        'bucket': os.getenv('S3_BUCKET', ''),
        'ak': os.getenv('S3_ACCESS_KEY', ''),
        'sk': os.getenv('S3_SECRET_KEY', ''),
        'endpoint_url': os.getenv('S3_ENDPOINT', ''),
    }
    reader = S3Reader(**connection)

    jsonl_key = 'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'

    # Reading the whole object must return something non-empty.
    whole = reader.read(jsonl_key)
    assert len(whole) > 0

    # The slice selected by (566, 713) must parse as non-empty JSON.
    piece = reader.read_at(jsonl_key, 566, 713)
    assert len(json.loads(piece)) > 0
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY') is None, reason='s3 config not found'
)
def test_s3_writer():
    """Write bytes with S3Writer and read them back with S3Reader.

    Skipped unless ``S3_ACCESS_KEY`` is set.  Connection settings are read
    from the environment: S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY and
    S3_ENDPOINT.
    """
    connection = {
        'bucket': os.getenv('S3_BUCKET', ''),
        'ak': os.getenv('S3_ACCESS_KEY', ''),
        'sk': os.getenv('S3_SECRET_KEY', ''),
        'endpoint_url': os.getenv('S3_ENDPOINT', ''),
    }
    test_fn = 'unittest/io/test.jsonl'

    writer = S3Writer(**connection)
    writer.write(test_fn, '123'.encode())

    # A separate reader with the same settings must see what was written.
    reader = S3Reader(**connection)
    stored = reader.read(test_fn)
    assert stored.decode() == '123'
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
def test_pymudataset():
    """Build a PymuDocDataset from the bytes of a fixture PDF and probe it."""
    pdf_path = 'tests/test_data/assets/pdfs/test_01.pdf'
    with open(pdf_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    dataset = PymuDocDataset(pdf_bytes)
    assert len(dataset) > 0

    # The first page's reported height must exceed 100.
    first_page_info = dataset.get_page(0).get_page_info()
    assert first_page_info.h > 100
def test_imagedataset():
    """Build an ImageDataset from the bytes of a fixture PNG and probe it."""
    png_path = 'tests/test_data/assets/pngs/test_01.png'
    with open(png_path, 'rb') as png_file:
        png_bytes = png_file.read()

    dataset = ImageDataset(png_bytes)
    assert len(dataset) == 1

    # The only page's reported width must exceed 100.
    page_info = dataset.get_page(0).get_page_info()
    assert page_info.w > 100
import os
import pytest
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.read_api import (read_jsonl, read_local_images,
read_local_pdfs)
from magic_pdf.data.schemas import S3Config
def test_read_local_pdfs():
    """read_local_pdfs on the fixture directory yields two non-empty datasets."""
    pdf_dir = 'tests/test_data/assets/pdfs'
    datasets = read_local_pdfs(pdf_dir)
    assert len(datasets) == 2

    # Both datasets must be non-empty.
    assert len(datasets[0]) > 0
    assert len(datasets[1]) > 0

    # The first page of the first dataset must report positive dimensions.
    width = datasets[0].get_page(0).get_page_info().w
    assert width > 0
    height = datasets[0].get_page(0).get_page_info().h
    assert height > 0
def test_read_local_images():
    """read_local_images on the fixture directory yields two one-page datasets."""
    png_dir = 'tests/test_data/assets/pngs'
    datasets = read_local_images(png_dir, suffixes=['png'])
    assert len(datasets) == 2

    # Each dataset must have length 1.
    assert len(datasets[0]) == 1
    assert len(datasets[1]) == 1

    # The first page of the first dataset must report positive dimensions.
    width = datasets[0].get_page(0).get_page_info().w
    assert width > 0
    height = datasets[0].get_page(0).get_page_info().h
    assert height > 0
@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY_2') is None, reason='need s3 config!'
)
def test_read_json():
    """Check read_jsonl for an s3:// JSONL and two local JSONL fixtures.

    Skipped unless ``S3_ACCESS_KEY_2`` is set.  Connection settings are read
    from the environment:

        S3_BUCKET, S3_ACCESS_KEY, S3_SECRET_KEY, S3_ENDPOINT
        S3_BUCKET_2, S3_ACCESS_KEY_2, S3_SECRET_KEY_2, S3_ENDPOINT_2
    """
    bucket = os.getenv('S3_BUCKET', '')
    bucket_2 = os.getenv('S3_BUCKET_2', '')

    first_config = S3Config(
        bucket_name=bucket,
        access_key=os.getenv('S3_ACCESS_KEY', ''),
        secret_key=os.getenv('S3_SECRET_KEY', ''),
        endpoint_url=os.getenv('S3_ENDPOINT', ''),
    )
    second_config = S3Config(
        bucket_name=bucket_2,
        access_key=os.getenv('S3_ACCESS_KEY_2', ''),
        secret_key=os.getenv('S3_SECRET_KEY_2', ''),
        endpoint_url=os.getenv('S3_ENDPOINT_2', ''),
    )
    s3configs = [first_config, second_config]
    reader = MultiBucketS3DataReader(bucket, s3configs)

    # JSONL addressed by s3:// URI, read through the multi-bucket reader.
    remote_sets = read_jsonl(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        reader,
    )
    assert len(remote_sets) > 0
    assert len(remote_sets[0]) == 10

    # Local JSONL file, with the reader supplied.
    with_reader = read_jsonl('tests/test_data/assets/jsonl/test_01.jsonl', reader)
    assert len(with_reader) == 1
    assert len(with_reader[0]) == 10

    # Local JSONL file, no reader supplied.
    without_reader = read_jsonl('tests/test_data/assets/jsonl/test_02.jsonl')
    assert len(without_reader) == 1
    assert len(without_reader[0]) == 1
{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "text",
"bbox": [
47,
57,
299,
93
],
"lines": [
{
"bbox": [
47,
57,
299,
68
],
"spans": [
{
"bbox": [
49,
57,
298,
68
],
"score": 0.98,
"content": "of the synthetic stereo scene from a single camera perspective",
"type": "text"
}
]
},
{
"bbox": [
47,
71,
299,
80
],
"spans": [
{
"bbox": [
49,
71,
299,
80
],
"score": 0.96,
"content": "along with the ground truth disparity,occlusion map,and",
"type": "text"
}
]
},
{
"bbox": [
47,
82,
123,
93
],
"spans": [
{
"bbox": [
49,
82,
123,
93
],
"score": 0.99,
"content": "discontinuitymap.",
"type": "text"
}
]
}
]
},
{
"type": "image",
"bbox": [
47,
100,
301,
535
],
"blocks": [
{
"bbox": [
51,
100,
292,
484
],
"type": "image_body",
"lines": [
{
"bbox": [
51,
100,
292,
484
],
"spans": [
{
"bbox": [
51,
100,
292,
484
],
"score": 0.9999815225601196,
"type": "image",
"image_path": "b07d74524eac6f46b5505b48b1e10db23f2b45cb2d21d5fec72e967e61255811.jpg"
}
]
}
]
},
{
"bbox": [
47,
488,
301,
535
],
"type": "image_caption",
"lines": [
{
"bbox": [
49,
490,
299,
499
],
"spans": [
{
"bbox": [
49,
490,
299,
499
],
"score": 1.0,
"content": "Figure2:Twosampleframesfromthesyntheticvideose-",
"type": "text"
}
]
},
{
"bbox": [
48,
501,
300,
512
],
"spans": [
{
"bbox": [
48,
501,
300,
512
],
"score": 1.0,
"content": "quence (1st row), along with their corresponding ground truth",
"type": "text"
}
]
},
{
"bbox": [
48,
513,
299,
523
],
"spans": [
{
"bbox": [
48,
513,
299,
523
],
"score": 0.98,
"content": "disparity (2nd row), occlusion map (3rd row), and discontinuity",
"type": "text"
}
]
},
{
"bbox": [
48,
525,
110,
535
],
"spans": [
{
"bbox": [
48,
525,
110,
535
],
"score": 0.99,
"content": "map (4th row).",
"type": "text"
}
]
}
]
}
]
},
{
"type": "text",
"bbox": [
47,
549,
299,
678
],
"lines": [
{
"bbox": [
58,
549,
299,
558
],
"spans": [
{
"bbox": [
58,
549,
298,
558
],
"score": 0.98,
"content": "Theresultsof temporalstereomatching aregiveninFigure",
"type": "text"
}
]
},
{
"bbox": [
47,
561,
299,
570
],
"spans": [
{
"bbox": [
47,
561,
298,
570
],
"score": 0.98,
"content": "3foruniformadditivenoiseconfinedtotherangesof±O",
"type": "text"
}
]
},
{
"bbox": [
47,
573,
299,
582
],
"spans": [
{
"bbox": [
49,
573,
299,
582
],
"score": 0.96,
"content": "±20, and ±40. Each performance plot is given as a function",
"type": "text"
}
]
},
{
"bbox": [
47,
585,
299,
594
],
"spans": [
{
"bbox": [
48,
585,
299,
594
],
"score": 0.95,
"content": "of the feedback coefficient X. As with the majority of temporal",
"type": "text"
}
]
},
{
"bbox": [
47,
597,
299,
606
],
"spans": [
{
"bbox": [
49,
597,
299,
606
],
"score": 0.99,
"content": "stereomatching methods,improvements are negligible when",
"type": "text"
}
]
},
{
"bbox": [
47,
609,
299,
618
],
"spans": [
{
"bbox": [
48,
609,
299,
618
],
"score": 0.97,
"content": "no noise is added to the images [1o], [19]. This is largely due",
"type": "text"
}
]
},
{
"bbox": [
47,
621,
299,
629
],
"spans": [
{
"bbox": [
48,
621,
299,
629
],
"score": 1.0,
"content": "tothefactthatthevideousedtoevaluatethesemethodsis",
"type": "text"
}
]
},
{
"bbox": [
47,
633,
299,
641
],
"spans": [
{
"bbox": [
48,
633,
299,
641
],
"score": 1.0,
"content": "computergeneratedwithverylittlenoisetostartwith,thus",
"type": "text"
}
]
},
{
"bbox": [
47,
644,
299,
654
],
"spans": [
{
"bbox": [
48,
644,
299,
654
],
"score": 0.98,
"content": "the noise suppression achieved with temporal stereo matching",
"type": "text"
}
]
},
{
"bbox": [
47,
657,
299,
666
],
"spans": [
{
"bbox": [
48,
657,
299,
666
],
"score": 0.98,
"content": "showslittletonoimprovementovermethodsthatoperate on",
"type": "text"
}
]
},
{
"bbox": [
47,
669,
113,
678
],
"spans": [
{
"bbox": [
48,
669,
113,
678
],
"score": 1.0,
"content": "pairsofimages.",
"type": "text"
}
]
}
]
},
{
"type": "text",
"bbox": [
47,
680,
299,
725
],
"lines": [
{
"bbox": [
58,
680,
299,
690
],
"spans": [
{
"bbox": [
59,
680,
298,
690
],
"score": 0.97,
"content": "Significantimprovementsin accuracy canbeseenin Figure",
"type": "text"
}
]
},
{
"bbox": [
47,
692,
299,
701
],
"spans": [
{
"bbox": [
48,
692,
298,
701
],
"score": 0.97,
"content": "3 when the noise has ranges of ±20, and ±40.In this scenario",
"type": "text"
}
]
},
{
"bbox": [
47,
703,
299,
714
],
"spans": [
{
"bbox": [
48,
703,
299,
714
],
"score": 0.98,
"content": "the effect of noise in the current frame is reduced by increasing",
"type": "text"
}
]
},
{
"bbox": [
47,
716,
299,
725
],
"spans": [
{
"bbox": [
48,
716,
299,
725
],
"score": 0.96,
"content": "thefeedbackcoefficientX.Thisincreasing ofXhas theeffect",
"type": "text"
}
]
}
]
},
{
"type": "image",
"bbox": [
310,
55,
564,
371
],
"blocks": [
{
"bbox": [
314,
55,
538,
305
],
"type": "image_body",
"lines": [
{
"bbox": [
314,
55,
538,
305
],
"spans": [
{
"bbox": [
314,
55,
538,
305
],
"score": 0.9999905824661255,
"type": "image",
"image_path": "c7539af438972442d0f86aa46409e6684338ddfd1fbfd6bdacf02220853ccb55.jpg"
}
]
}
]
},
{
"bbox": [
310,
311,
564,
371
],
"type": "image_caption",
"lines": [
{
"bbox": [
312,
313,
562,
322
],
"spans": [
{
"bbox": [
312,
313,
562,
322
],
"score": 0.97,
"content": "Figure 3: Performance of temporal matching at different levels",
"type": "text"
}
]
},
{
"bbox": [
312,
325,
561,
334
],
"spans": [
{
"bbox": [
312,
325,
561,
334
],
"score": 0.98,
"content": "of uniformly distributed image noise{±0,±20,±40}.Mean",
"type": "text"
}
]
},
{
"bbox": [
311,
336,
563,
347
],
"spans": [
{
"bbox": [
311,
336,
563,
347
],
"score": 0.99,
"content": "squared error (MSE) of disparities is plotted versus the values",
"type": "text"
}
]
},
{
"bbox": [
311,
348,
561,
358
],
"spans": [
{
"bbox": [
311,
348,
561,
358
],
"score": 0.96,
"content": "of the feedback coefficient X. Dashed lines correspond to the",
"type": "text"
}
]
},
{
"bbox": [
311,
360,
535,
371
],
"spans": [
{
"bbox": [
311,
360,
535,
371
],
"score": 0.96,
"content": "values of MSE obtained without temporal aggregation.",
"type": "text"
}
]
}
]
}
]
},
{
"type": "image",
"bbox": [
310,
418,
563,
666
],
"blocks": [
{
"bbox": [
314,
418,
549,
623
],
"type": "image_body",
"lines": [
{
"bbox": [
314,
418,
549,
623
],
"spans": [
{
"bbox": [
314,
418,
549,
623
],
"score": 0.9999067783355713,
"type": "image",
"image_path": "9ac4db9197801de4a20dbc9ea17bc0c53afb7290dc8b5b45d9e92e830566cb14.jpg"
}
]
}
]
},
{
"bbox": [
310,
630,
563,
666
],
"type": "image_caption",
"lines": [
{
"bbox": [
312,
631,
562,
641
],
"spans": [
{
"bbox": [
312,
631,
562,
641
],
"score": 0.94,
"content": "Figure 4:Optimal values of the feedback coefficient \\ cor-",
"type": "text"
}
]
},
{
"bbox": [
312,
644,
561,
652
],
"spans": [
{
"bbox": [
312,
644,
561,
652
],
"score": 0.97,
"content": "responding to the smallest mean squared error (MSE)of the",
"type": "text"
}
]
},
{
"bbox": [
312,
655,
513,
665
],
"spans": [
{
"bbox": [
312,
655,
513,
665
],
"score": 0.97,
"content": "disparity estimates for a range of noise strengths.",
"type": "text"
}
]
}
]
}
]
},
{
"type": "text",
"bbox": [
311,
692,
563,
725
],
"lines": [
{
"bbox": [
311,
692,
563,
702
],
"spans": [
{
"bbox": [
311,
692,
562,
702
],
"score": 0.95,
"content": "of averaging out noise in the per-pixel costs by selecting",
"type": "text"
}
]
},
{
"bbox": [
311,
704,
563,
713
],
"spans": [
{
"bbox": [
311,
704,
562,
713
],
"score": 0.98,
"content": "matches based more heavily upon the auxiliary cost, which",
"type": "text"
}
]
},
{
"bbox": [
311,
716,
563,
725
],
"spans": [
{
"bbox": [
311,
716,
563,
725
],
"score": 0.97,
"content": "is essentially a much more stable running average of the cost",
"type": "text"
}
]
}
]
}
],
"layout_bboxes": [
{
"layout_bbox": [
47,
55,
301,
726
],
"layout_label": "V",
"sub_layout": []
},
{
"layout_bbox": [
310,
55,
564,
726
],
"layout_label": "V",
"sub_layout": []
}
],
"page_idx": 0,
"page_size": [
612.0,
792.0
],
"_layout_tree": [
{
"layout_bbox": [
0,
55,
612.0,
726
],
"layout_label": "V",
"sub_layout": [
{
"layout_bbox": [
47,
55,
564,
726
],
"layout_label": "H",
"sub_layout": [
{
"layout_bbox": [
47,
55,
301,
726
],
"layout_label": "V",
"sub_layout": []
},
{
"layout_bbox": [
310,
55,
564,
726
],
"layout_label": "V",
"sub_layout": []
}
]
}
]
}
],
"images": [
{
"type": "image",
"bbox": [
47,
100,
301,
535
],
"blocks": [
{
"bbox": [
51,
100,
292,
484
],
"type": "image_body",
"lines": [
{
"bbox": [
51,
100,
292,
484
],
"spans": [
{
"bbox": [
51,
100,
292,
484
],
"score": 0.9999815225601196,
"type": "image",
"image_path": "b07d74524eac6f46b5505b48b1e10db23f2b45cb2d21d5fec72e967e61255811.jpg"
}
]
}
]
},
{
"bbox": [
47,
488,
301,
535
],
"type": "image_caption",
"lines": [
{
"bbox": [
49,
490,
299,
499
],
"spans": [
{
"bbox": [
49,
490,
299,
499
],
"score": 1.0,
"content": "Figure2:Twosampleframesfromthesyntheticvideose-",
"type": "text"
}
]
},
{
"bbox": [
48,
501,
300,
512
],
"spans": [
{
"bbox": [
48,
501,
300,
512
],
"score": 1.0,
"content": "quence (1st row), along with their corresponding ground truth",
"type": "text"
}
]
},
{
"bbox": [
48,
513,
299,
523
],
"spans": [
{
"bbox": [
48,
513,
299,
523
],
"score": 0.98,
"content": "disparity (2nd row), occlusion map (3rd row), and discontinuity",
"type": "text"
}
]
},
{
"bbox": [
48,
525,
110,
535
],
"spans": [
{
"bbox": [
48,
525,
110,
535
],
"score": 0.99,
"content": "map (4th row).",
"type": "text"
}
]
}
]
}
]
},
{
"type": "image",
"bbox": [
310,
55,
564,
371
],
"blocks": [
{
"bbox": [
314,
55,
538,
305
],
"type": "image_body",
"lines": [
{
"bbox": [
314,
55,
538,
305
],
"spans": [
{
"bbox": [
314,
55,
538,
305
],
"score": 0.9999905824661255,
"type": "image",
"image_path": "c7539af438972442d0f86aa46409e6684338ddfd1fbfd6bdacf02220853ccb55.jpg"
}
]
}
]
},
{
"bbox": [
310,
311,
564,
371
],
"type": "image_caption",
"lines": [
{
"bbox": [
312,
313,
562,
322
],
"spans": [
{
"bbox": [
312,
313,
562,
322
],
"score": 0.97,
"content": "Figure 3: Performance of temporal matching at different levels",
"type": "text"
}
]
},
{
"bbox": [
312,
325,
561,
334
],
"spans": [
{
"bbox": [
312,
325,
561,
334
],
"score": 0.98,
"content": "of uniformly distributed image noise{±0,±20,±40}.Mean",
"type": "text"
}
]
},
{
"bbox": [
311,
336,
563,
347
],
"spans": [
{
"bbox": [
311,
336,
563,
347
],
"score": 0.99,
"content": "squared error (MSE) of disparities is plotted versus the values",
"type": "text"
}
]
},
{
"bbox": [
311,
348,
561,
358
],
"spans": [
{
"bbox": [
311,
348,
561,
358
],
"score": 0.96,
"content": "of the feedback coefficient X. Dashed lines correspond to the",
"type": "text"
}
]
},
{
"bbox": [
311,
360,
535,
371
],
"spans": [
{
"bbox": [
311,
360,
535,
371
],
"score": 0.96,
"content": "values of MSE obtained without temporal aggregation.",
"type": "text"
}
]
}
]
}
]
},
{
"type": "image",
"bbox": [
310,
418,
563,
666
],
"blocks": [
{
"bbox": [
314,
418,
549,
623
],
"type": "image_body",
"lines": [
{
"bbox": [
314,
418,
549,
623
],
"spans": [
{
"bbox": [
314,
418,
549,
623
],
"score": 0.9999067783355713,
"type": "image",
"image_path": "9ac4db9197801de4a20dbc9ea17bc0c53afb7290dc8b5b45d9e92e830566cb14.jpg"
}
]
}
]
},
{
"bbox": [
310,
630,
563,
666
],
"type": "image_caption",
"lines": [
{
"bbox": [
312,
631,
562,
641
],
"spans": [
{
"bbox": [
312,
631,
562,
641
],
"score": 0.94,
"content": "Figure 4:Optimal values of the feedback coefficient \\ cor-",
"type": "text"
}
]
},
{
"bbox": [
312,
644,
561,
652
],
"spans": [
{
"bbox": [
312,
644,
561,
652
],
"score": 0.97,
"content": "responding to the smallest mean squared error (MSE)of the",
"type": "text"
}
]
},
{
"bbox": [
312,
655,
513,
665
],
"spans": [
{
"bbox": [
312,
655,
513,
665
],
"score": 0.97,
"content": "disparity estimates for a range of noise strengths.",
"type": "text"
}
]
}
]
}
]
}
],
"tables": [],
"interline_equations": [],
"discarded_blocks": [],
"need_drop": false,
"drop_reason": [],
"para_blocks": [
{
"type": "text",
"bbox": [
47,
57,
299,
93
],
"lines": [
{
"bbox": [
47,
57,
299,
68
],
"spans": [
{
"bbox": [
49,
57,
298,
68
],
"score": 0.98,
"content": "of the synthetic stereo scene from a single camera perspective",
"type": "text"
}
]
},
{
"bbox": [
47,
71,
299,
80
],
"spans": [
{
"bbox": [
49,
71,
299,
80
],
"score": 0.96,
"content": "along with the ground truth disparity,occlusion map,and",
"type": "text"
}
]
},
{
"bbox": [
47,
82,
123,
93
],
"spans": [
{
"bbox": [
49,
82,
123,
93
],
"score": 0.99,
"content": "discontinuitymap.",
"type": "text"
}
]
}
]
},
{
"type": "image",
"bbox": [
47,
100,
301,
535
],
"blocks": [
{
"bbox": [
51,
100,
292,
484
],
"type": "image_body",
"lines": [
{
"bbox": [
51,
100,
292,
484
],
"spans": [
{
"bbox": [
51,
100,
292,
484
],
"score": 0.9999815225601196,
"type": "image",
"image_path": "b07d74524eac6f46b5505b48b1e10db23f2b45cb2d21d5fec72e967e61255811.jpg"
}
]
}
]
},
{
"bbox": [
47,
488,
301,
535
],
"type": "image_caption",
"lines": [
{
"bbox": [
49,
490,
299,
499
],
"spans": [
{
"bbox": [
49,
490,
299,
499
],
"score": 1.0,
"content": "Figure2:Twosampleframesfromthesyntheticvideose-",
"type": "text"
}
]
},
{
"bbox": [
48,
501,
300,
512
],
"spans": [
{
"bbox": [
48,
501,
300,
512
],
"score": 1.0,
"content": "quence (1st row), along with their corresponding ground truth",
"type": "text"
}
]
},
{
"bbox": [
48,
513,
299,
523
],
"spans": [
{
"bbox": [
48,
513,
299,
523
],
"score": 0.98,
"content": "disparity (2nd row), occlusion map (3rd row), and discontinuity",
"type": "text"
}
]
},
{
"bbox": [
48,
525,
110,
535
],
"spans": [
{
"bbox": [
48,
525,
110,
535
],
"score": 0.99,
"content": "map (4th row).",
"type": "text"
}
]
}
]
}
]
},
{
"type": "text",
"bbox": [
47,
549,
299,
678
],
"lines": [
{
"bbox": [
58,
549,
299,
558
],
"spans": [
{
"bbox": [
58,
549,
298,
558
],
"score": 0.98,
"content": "Theresultsof temporalstereomatching aregiveninFigure",
"type": "text"
}
]
},
{
"bbox": [
47,
561,
299,
570
],
"spans": [
{
"bbox": [
47,
561,
298,
570
],
"score": 0.98,
"content": "3foruniformadditivenoiseconfinedtotherangesof±O",
"type": "text"
}
]
},
{
"bbox": [
47,
573,
299,
582
],
"spans": [
{
"bbox": [
49,
573,
299,
582
],
"score": 0.96,
"content": "±20, and ±40. Each performance plot is given as a function",
"type": "text"
}
]
},
{
"bbox": [
47,
585,
299,
594
],
"spans": [
{
"bbox": [
48,
585,
299,
594
],
"score": 0.95,
"content": "of the feedback coefficient X. As with the majority of temporal",
"type": "text"
}
]
},
{
"bbox": [
47,
597,
299,
606
],
"spans": [
{
"bbox": [
49,
597,
299,
606
],
"score": 0.99,
"content": "stereomatching methods,improvements are negligible when",
"type": "text"
}
]
},
{
"bbox": [
47,
609,
299,
618
],
"spans": [
{
"bbox": [
48,
609,
299,
618
],
"score": 0.97,
"content": "no noise is added to the images [1o], [19]. This is largely due",
"type": "text"
}
]
},
{
"bbox": [
47,
621,
299,
629
],
"spans": [
{
"bbox": [
48,
621,
299,
629
],
"score": 1.0,
"content": "tothefactthatthevideousedtoevaluatethesemethodsis",
"type": "text"
}
]
},
{
"bbox": [
47,
633,
299,
641
],
"spans": [
{
"bbox": [
48,
633,
299,
641
],
"score": 1.0,
"content": "computergeneratedwithverylittlenoisetostartwith,thus",
"type": "text"
}
]
},
{
"bbox": [
47,
644,
299,
654
],
"spans": [
{
"bbox": [
48,
644,
299,
654
],
"score": 0.98,
"content": "the noise suppression achieved with temporal stereo matching",
"type": "text"
}
]
},
{
"bbox": [
47,
657,
299,
666
],
"spans": [
{
"bbox": [
48,
657,
299,
666
],
"score": 0.98,
"content": "showslittletonoimprovementovermethodsthatoperate on",
"type": "text"
}
]
},
{
"bbox": [
47,
669,
113,
678
],
"spans": [
{
"bbox": [
48,
669,
113,
678
],
"score": 1.0,
"content": "pairsofimages.",
"type": "text"
}
]
}
]
},
{
"type": "text",
"bbox": [
47,
680,
299,
725
],
"lines": [
{
"bbox": [
58,
680,
299,
690
],
"spans": [
{
"bbox": [
59,
680,
298,
690
],
"score": 0.97,
"content": "Significantimprovementsin accuracy canbeseenin Figure",
"type": "text"
}
]
},
{
"bbox": [
47,
692,
299,
701
],
"spans": [
{
"bbox": [
48,
692,
298,
701
],
"score": 0.97,
"content": "3 when the noise has ranges of ±20, and ±40.In this scenario",
"type": "text"
}
]
},
{
"bbox": [
47,
703,
299,
714
],
"spans": [
{
"bbox": [
48,
703,
299,
714
],
"score": 0.98,
"content": "the effect of noise in the current frame is reduced by increasing",
"type": "text"
}
]
},
{
"bbox": [
47,
716,
299,
725
],
"spans": [
{
"bbox": [
48,
716,
299,
725
],
"score": 0.96,
"content": "thefeedbackcoefficientX.Thisincreasing ofXhas theeffect",
"type": "text"
}
]
}
]
},
{
"type": "image",
"bbox": [
310,
55,
564,
371
],
"blocks": [
{
"bbox": [
314,
55,
538,
305
],
"type": "image_body",
"lines": [
{
"bbox": [
314,
55,
538,
305
],
"spans": [
{
"bbox": [
314,
55,
538,
305
],
"score": 0.9999905824661255,
"type": "image",
"image_path": "c7539af438972442d0f86aa46409e6684338ddfd1fbfd6bdacf02220853ccb55.jpg"
}
]
}
]
},
{
"bbox": [
310,
311,
564,
371
],
"type": "image_caption",
"lines": [
{
"bbox": [
312,
313,
562,
322
],
"spans": [
{
"bbox": [
312,
313,
562,
322
],
"score": 0.97,
"content": "Figure 3: Performance of temporal matching at different levels",
"type": "text"
}
]
},
{
"bbox": [
312,
325,
561,
334
],
"spans": [
{
"bbox": [
312,
325,
561,
334
],
"score": 0.98,
"content": "of uniformly distributed image noise{±0,±20,±40}.Mean",
"type": "text"
}
]
},
{
"bbox": [
311,
336,
563,
347
],
"spans": [
{
"bbox": [
311,
336,
563,
347
],
"score": 0.99,
"content": "squared error (MSE) of disparities is plotted versus the values",
"type": "text"
}
]
},
{
"bbox": [
311,
348,
561,
358
],
"spans": [
{
"bbox": [
311,
348,
561,
358
],
"score": 0.96,
"content": "of the feedback coefficient X. Dashed lines correspond to the",
"type": "text"
}
]
},
{
"bbox": [
311,
360,
535,
371
],
"spans": [
{
"bbox": [
311,
360,
535,
371
],
"score": 0.96,
"content": "values of MSE obtained without temporal aggregation.",
"type": "text"
}
]
}
]
}
]
},
{
"type": "image",
"bbox": [
310,
418,
563,
666
],
"blocks": [
{
"bbox": [
314,
418,
549,
623
],
"type": "image_body",
"lines": [
{
"bbox": [
314,
418,
549,
623
],
"spans": [
{
"bbox": [
314,
418,
549,
623
],
"score": 0.9999067783355713,
"type": "image",
"image_path": "9ac4db9197801de4a20dbc9ea17bc0c53afb7290dc8b5b45d9e92e830566cb14.jpg"
}
]
}
]
},
{
"bbox": [
310,
630,
563,
666
],
"type": "image_caption",
"lines": [
{
"bbox": [
312,
631,
562,
641
],
"spans": [
{
"bbox": [
312,
631,
562,
641
],
"score": 0.94,
"content": "Figure 4:Optimal values of the feedback coefficient \\ cor-",
"type": "text"
}
]
},
{
"bbox": [
312,
644,
561,
652
],
"spans": [
{
"bbox": [
312,
644,
561,
652
],
"score": 0.97,
"content": "responding to the smallest mean squared error (MSE)of the",
"type": "text"
}
]
},
{
"bbox": [
312,
655,
513,
665
],
"spans": [
{
"bbox": [
312,
655,
513,
665
],
"score": 0.97,
"content": "disparity estimates for a range of noise strengths.",
"type": "text"
}
]
}
]
}
]
},
{
"type": "text",
"bbox": [
311,
692,
563,
725
],
"lines": [
{
"bbox": [
311,
692,
563,
702
],
"spans": [
{
"bbox": [
311,
692,
562,
702
],
"score": 0.95,
"content": "of averaging out noise in the per-pixel costs by selecting",
"type": "text"
}
]
},
{
"bbox": [
311,
704,
563,
713
],
"spans": [
{
"bbox": [
311,
704,
562,
713
],
"score": 0.98,
"content": "matches based more heavily upon the auxiliary cost, which",
"type": "text"
}
]
},
{
"bbox": [
311,
716,
563,
725
],
"spans": [
{
"bbox": [
311,
716,
563,
725
],
"score": 0.97,
"content": "is essentially a much more stable running average of the cost",
"type": "text"
}
]
}
]
}
]
}
],
"_parse_type": "ocr",
"_version_name": "0.7.0b1"
}
import json
import os
import shutil
import tempfile
from magic_pdf.integrations.rag.api import DataReader, RagDocumentReader
from magic_pdf.integrations.rag.type import CategoryType
from magic_pdf.integrations.rag.utils import \
convert_middle_json_to_layout_elements
def test_rag_document_reader():
    """Exercise ``RagDocumentReader`` on the bundled ``middle.json`` asset.

    The fixture is converted to layout elements and wrapped in a
    ``RagDocumentReader``. The test expects a single page that yields 10
    items, a relation map of length 3, and a first item whose category is
    ``CategoryType.text``.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no further makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        with open('tests/test_integrations/test_rag/assets/middle.json',
                  encoding='utf-8') as f:
            json_data = json.load(f)

        res = convert_middle_json_to_layout_elements(json_data,
                                                     temp_output_dir)
        doc = RagDocumentReader(res)

        # Materialise each iterable once and reuse it for every check.
        pages = list(doc)
        assert len(pages) == 1

        page = pages[0]
        items = list(page)
        assert len(items) == 10
        assert len(page.get_rel_map()) == 3
        assert items[0].category_type == CategoryType.text
    finally:
        # teardown: runs even when an assertion above fails, so a failing
        # run does not leave its scratch directory behind.
        shutil.rmtree(temp_output_dir)
def test_data_reader():
    """Exercise ``DataReader`` over the RAG test asset directory.

    A ``DataReader`` is built for the asset directory in ``'ocr'`` mode with
    a scratch output directory. The test expects it to report 2 documents
    and to return a non-``None`` result for each document index.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no further makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        data_reader = DataReader('tests/test_integrations/test_rag/assets',
                                 'ocr', temp_output_dir)

        documents_count = data_reader.get_documents_count()
        assert documents_count == 2

        for idx in range(documents_count):
            document = data_reader.get_document_result(idx)
            assert document is not None
    finally:
        # teardown: runs even when an assertion above fails, so a failing
        # run does not leave its scratch directory behind.
        shutil.rmtree(temp_output_dir)
import json
import os
import shutil
import tempfile
from magic_pdf.integrations.rag.type import CategoryType
from magic_pdf.integrations.rag.utils import (
convert_middle_json_to_layout_elements, inference)
def test_convert_middle_json_to_layout_elements():
    """Exercise ``convert_middle_json_to_layout_elements`` on ``middle.json``.

    The test expects a single result whose ``layout_dets`` has 10 entries,
    whose first entry has ``anno_id == 0`` and category ``CategoryType.text``,
    and whose ``extra.element_relation`` has 3 entries.
    """
    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no further makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        with open('tests/test_integrations/test_rag/assets/middle.json',
                  encoding='utf-8') as f:
            json_data = json.load(f)

        res = convert_middle_json_to_layout_elements(json_data,
                                                     temp_output_dir)

        assert len(res) == 1

        first_page = res[0]
        assert len(first_page.layout_dets) == 10
        assert first_page.layout_dets[0].anno_id == 0
        assert first_page.layout_dets[0].category_type == CategoryType.text
        assert len(first_page.extra.element_relation) == 3
    finally:
        # teardown: runs even when an assertion above fails, so a failing
        # run does not leave its scratch directory behind.
        shutil.rmtree(temp_output_dir)
def test_inference():
    """Exercise ``inference`` on the one-page PDF asset in ``'ocr'`` mode.

    The test expects a non-``None`` result with a single entry whose
    ``layout_dets`` has 10 entries, whose first entry has ``anno_id == 0``
    and category ``CategoryType.text``, and whose ``extra.element_relation``
    has 3 entries.
    """
    asset_dir = 'tests/test_integrations/test_rag/assets'

    # setup
    unitest_dir = '/tmp/magic_pdf/unittest/integrations/rag'
    os.makedirs(unitest_dir, exist_ok=True)
    # mkdtemp creates the directory itself, so no further makedirs is needed.
    temp_output_dir = tempfile.mkdtemp(dir=unitest_dir)

    try:
        # test
        res = inference(
            os.path.join(asset_dir, 'one_page_with_table_image.pdf'),
            temp_output_dir,
            'ocr',
        )

        assert res is not None
        assert len(res) == 1

        first_page = res[0]
        assert len(first_page.layout_dets) == 10
        assert first_page.layout_dets[0].anno_id == 0
        assert first_page.layout_dets[0].category_type == CategoryType.text
        assert len(first_page.extra.element_relation) == 3
    finally:
        # teardown: runs even when an assertion above fails, so a failing
        # run does not leave its scratch directory behind.
        shutil.rmtree(temp_output_dir)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment