test_s3.py 3.38 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
import json
import os

import pytest

from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter


@pytest.mark.skipif(
    os.getenv('S3_ACCESS_KEY', None) is None, reason='need s3 config!'
)
def test_s3_reader_writer():
    """Round-trip objects through S3DataReader / S3DataWriter with no prefix.

    The test is skipped unless ``S3_ACCESS_KEY`` is set. Connection settings
    are taken from the environment::

        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
    """
    bucket = os.getenv('S3_BUCKET', '')
    ak = os.getenv('S3_ACCESS_KEY', '')
    sk = os.getenv('S3_SECRET_KEY', '')
    endpoint_url = os.getenv('S3_ENDPOINT', '')

    # Empty prefix: relative paths are passed to the reader/writer as-is.
    reader = S3DataReader('', bucket, ak, sk, endpoint_url)
    writer = S3DataWriter('', bucket, ak, sk, endpoint_url)

    # The same object must be readable by relative key and by full s3:// URI.
    bits = reader.read('meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl')

    assert bits == reader.read(
        f's3://{bucket}/meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )

    # A '?bytes=566,713' suffix on the path must yield the same bytes as
    # read_at(path, 566, 713), and that slice must parse as non-empty JSON.
    bits = reader.read(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    assert bits == reader.read_at(
        'meta-index/scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert len(json.loads(bits)) > 0

    # write_string() followed by read() returns the encoded text.
    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )

    assert 'abc'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )

    # NOTE(review): the write target starts with the bucket name (without an
    # 's3://' scheme) while the read-back below uses the bare key -- confirm
    # that S3DataWriter resolves both to the same object.
    writer.write(
        f'{bucket}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        '123'.encode(),
    )

    assert '123'.encode() == reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
icecraft's avatar
icecraft committed
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106


@pytest.mark.skipif('S3_ACCESS_KEY' not in os.environ, reason='need s3 config!')
def test_s3_reader_writer_with_prefix():
    """Round-trip objects through an S3 reader/writer pair that share a prefix.

    The test is skipped unless ``S3_ACCESS_KEY`` is set. Connection settings
    are taken from the environment::

        export S3_BUCKET=xxx
        export S3_ACCESS_KEY=xxx
        export S3_SECRET_KEY=xxx
        export S3_ENDPOINT=xxx
    """
    bucket, access_key, secret_key, endpoint = (
        os.getenv(name, '')
        for name in ('S3_BUCKET', 'S3_ACCESS_KEY', 'S3_SECRET_KEY', 'S3_ENDPOINT')
    )
    prefix = 'meta-index'

    # Reader and writer are built from the same positional settings.
    conn_args = (prefix, bucket, access_key, secret_key, endpoint)
    reader = S3DataReader(*conn_args)
    writer = S3DataWriter(*conn_args)

    # The same object must be readable by prefix-relative key and by the full
    # s3:// URI that spells out bucket and prefix.
    relative_payload = reader.read('scihub/v001/scihub/part-66210c190659-000026.jsonl')
    absolute_payload = reader.read(
        f's3://{bucket}/{prefix}/scihub/v001/scihub/part-66210c190659-000026.jsonl'
    )
    assert relative_payload == absolute_payload

    # A '?bytes=566,713' suffix on the path must yield the same bytes as
    # read_at(path, 566, 713), and that slice must parse as non-empty JSON.
    ranged_payload = reader.read(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl?bytes=566,713'
    )
    read_at_payload = reader.read_at(
        'scihub/v001/scihub/part-66210c190659-000026.jsonl', 566, 713
    )
    assert ranged_payload == read_at_payload
    assert len(json.loads(ranged_payload)) > 0

    # Text written via write_string() is visible under both addressing forms.
    writer.write_string(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt', 'abc'
    )
    relative_text = reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )
    assert relative_text == b'abc'
    absolute_text = reader.read(
        f's3://{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test01.txt'
    )
    assert absolute_text == b'abc'

    # NOTE(review): the write target starts with '<bucket>/<prefix>/' (without
    # an 's3://' scheme) while the read-back below uses the prefix-relative
    # key -- confirm that S3DataWriter resolves both to the same object.
    writer.write(
        f'{bucket}/{prefix}/unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt',
        b'123',
    )
    raw_bytes = reader.read(
        'unittest/data/data_reader_writer/multi_bucket_s3_data/test02.txt'
    )
    assert raw_bytes == b'123'