s3.py 3.52 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import boto3
from botocore.config import Config

from magic_pdf.data.io.base import IOReader, IOWriter


class S3Reader(IOReader):
    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def read(self, key: str) -> bytes:
        """Read the file.

        Args:
            path (str): file path to read

        Returns:
            bytes: the content of the file
        """
        return self.read_at(key)

    def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read at offset and limit.

        Args:
            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
            offset (int, optional): the number of bytes skipped. Defaults to 0.
            limit (int, optional): the length of bytes want to read. Defaults to -1.

        Returns:
            bytes: the content of file
        """
        if limit > -1:
            range_header = f'bytes={offset}-{offset+limit-1}'
            res = self._s3_client.get_object(
                Bucket=self._bucket, Key=key, Range=range_header
            )
        else:
            res = self._s3_client.get_object(
                Bucket=self._bucket, Key=key, Range=f'bytes={offset}-'
            )
        return res['Body'].read()


class S3Writer(IOWriter):
    def __init__(
        self,
        bucket: str,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = 'auto',
    ):
        """s3 reader client.

        Args:
            bucket (str): bucket name
            ak (str): access key
            sk (str): secret key
            endpoint_url (str): endpoint url of s3
            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
        """
        self._bucket = bucket
        self._ak = ak
        self._sk = sk
        self._s3_client = boto3.client(
            service_name='s3',
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={'addressing_style': addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def write(self, key: str, data: bytes):
        """Write file with data.

        Args:
            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
            data (bytes): the data want to write
        """
        self._s3_client.put_object(Bucket=self._bucket, Key=key, Body=data)