test_read_api.py 2.45 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os

import pytest

from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.read_api import (read_jsonl, read_local_images,
                                     read_local_pdfs)
from magic_pdf.data.schemas import S3Config


def test_read_local_pdfs():
    """Check that ``read_local_pdfs`` loads the sample PDF directory.

    The fixture directory must yield exactly two non-empty datasets, and the
    first page of the first dataset must report positive dimensions.
    """
    pdf_datasets = read_local_pdfs('tests/test_data/assets/pdfs')

    # Exactly two datasets are expected from the fixture directory.
    assert len(pdf_datasets) == 2

    first, second = pdf_datasets[0], pdf_datasets[1]
    assert len(first) > 0
    assert len(second) > 0

    # Page 0 of the first dataset must expose a positive width and height.
    assert first.get_page(0).get_page_info().w > 0
    assert first.get_page(0).get_page_info().h > 0


def test_read_local_images():
    """Check that ``read_local_images`` loads the sample PNG directory.

    Only files matching the ``png`` suffix are requested. The fixture
    directory must yield exactly two datasets of length one each, and the
    first page of the first dataset must report positive dimensions.
    """
    image_datasets = read_local_images(
        'tests/test_data/assets/pngs',
        suffixes=['png'],
    )

    # Exactly two datasets are expected from the fixture directory.
    assert len(image_datasets) == 2

    first, second = image_datasets[0], image_datasets[1]
    # Each image-backed dataset has length 1 (presumably one page per image).
    assert len(first) == 1
    assert len(second) == 1

    # Page 0 of the first dataset must expose a positive width and height.
    assert first.get_page(0).get_page_info().w > 0
    assert first.get_page(0).get_page_info().h > 0


@pytest.mark.skipif(
    # Every variable listed here feeds an S3Config, the reader's default
    # bucket or the S3 path below, so skip unless all of them are set to a
    # non-empty value (checking a single key let half-configured runs fail
    # with confusing errors instead of being skipped).
    not all(
        os.getenv(name)
        for name in (
            'S3_BUCKET',
            'S3_ACCESS_KEY',
            'S3_SECRET_KEY',
            'S3_ENDPOINT',
            'S3_BUCKET_2',
            'S3_ACCESS_KEY_2',
            'S3_SECRET_KEY_2',
            'S3_ENDPOINT_2',
        )
    ),
    reason='need s3 config!',
)
def test_read_json():
    """Exercise ``read_jsonl`` with an S3 path and with local JSONL files.

    The test is skipped unless both sets of S3 settings are configured in
    the environment::

        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx

        export S3_BUCKET_2=xxx
        export S3_ACCESS_KEY_2=xxx
        export S3_SECRET_KEY_2=xxx
        export S3_ENDPOINT_2=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    bucket_2 = os.getenv('S3_BUCKET_2', '')
    ak_2 = os.getenv('S3_ACCESS_KEY_2', '')
    sk_2 = os.getenv('S3_SECRET_KEY_2', '')
    endpoint_url_2 = os.getenv('S3_ENDPOINT_2', '')

    # One config per bucket; the reader is created with the first bucket as
    # its first argument.
    s3configs = [
        S3Config(
            bucket_name=bucket,
            access_key=ak,
            secret_key=sk,
            endpoint_url=endpoint_url,
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=ak_2,
            secret_key=sk_2,
            endpoint_url=endpoint_url_2,
        ),
    ]

    reader = MultiBucketS3DataReader(bucket, s3configs)

    # Case 1: the JSONL index itself is addressed by an s3:// path.
    datasets = read_jsonl(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl',
        reader,
    )
    assert len(datasets) > 0
    assert len(datasets[0]) == 10

    # Case 2: a local JSONL file, read with the S3 reader supplied
    # (presumably its entries point at S3 objects -- verify against fixture).
    datasets = read_jsonl('tests/test_data/assets/jsonl/test_01.jsonl', reader)
    assert len(datasets) == 1
    assert len(datasets[0]) == 10

    # Case 3: a local JSONL file read without passing any reader.
    datasets = read_jsonl('tests/test_data/assets/jsonl/test_02.jsonl')
    assert len(datasets) == 1
    assert len(datasets[0]) == 1