Commit 3cd51d49 authored by xu rui's avatar xu rui
Browse files

feat: rewrite code snippet

parent 6ca86bea
......@@ -87,56 +87,70 @@ Read Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# file based related
# file based related
file_based_reader1 = FileBasedDataReader('')
## will read file abc
file_based_reader1.read('abc')
## will read file abc
file_based_reader1.read('abc')
file_based_reader2 = FileBasedDataReader('/tmp')
## will read /tmp/abc
file_based_reader2.read('abc')
## will read /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## will read /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# multi bucket s3 related
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will read s3://test_bucket1/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_reader1.read('abc')
## will read s3://test_bucket1/efg
multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
## will read s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
## will read s3://test_bucket2/abc
multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
## will read s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
# s3 related
s3_reader1 = S3DataReader(
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## will read s3://test_bucket/test_prefix/abc
## will read s3://{bucket}/{test_prefix}/abc
s3_reader1.read('abc')
## will read s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## will read s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
Write Examples
......@@ -144,65 +158,79 @@ Write Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
from magic_pdf.data.schemas import S3Config
# file based related
file_based_writer1 = FileBasedDataWriter('')
# file based related
file_based_writer1 = FileBasedDataWriter("")
## will write 123 to abc
file_based_writer1.write('abc', '123'.encode())
file_based_writer1.write("abc", "123".encode())
## will write 123 to abc
file_based_writer1.write_string('abc', '123')
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter('/tmp')
file_based_writer2 = FileBasedDataWriter("/tmp")
## will write 123 to /tmp/abc
file_based_writer2.write_string('abc', '123')
file_based_writer2.write_string("abc", "123")
## will write 123 to /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
## will write 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# multi bucket s3 related
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
],
)
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket1/efg
multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
## will write 123 to s3://test_bucket2/abc
multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
## will write 123 to s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
# s3 related
s3_writer1 = S3DataWriter(
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
)
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write('abc', '123'.encode())
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write_string("abc", "123")
## will write 123 to s3://test_bucket/test_prefix/abc
s3_writer1.write_string('abc', '123')
## will write 123 to s3://{bucket}/efg
s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
## will write 123 to s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
Check :doc:`../../api/data_reader_writer` for more details
......@@ -80,10 +80,10 @@ Read images from path or directory
from magic_pdf.data.read_api import *
# read from image path
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png") # replace with real file path
# read files from the directory that end with a suffix listed in the suffixes array
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"]) # replace with real directory
Check :doc:`../../api/read_api` for more details
\ No newline at end of file
......@@ -73,118 +73,146 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
---------
.. code:: python
from magic_pdf.data.data_reader_writer import *
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# 文件相关的
# 初始化 reader
file_based_reader1 = FileBasedDataReader('')
## 将读取文件 abc
file_based_reader1.read('abc')
## 读本地文件 abc
file_based_reader1.read('abc')
file_based_reader2 = FileBasedDataReader('/tmp')
## 将读取 /tmp/abc
## 读本地文件 /tmp/abc
file_based_reader2.read('abc')
## 将读取 /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## 读本地文件 /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# 初始化多桶 s3 reader
bucket = "bucket" # 替换为有效的 bucket
ak = "ak" # 替换为有效的 access key
sk = "sk" # 替换为有效的 secret key
endpoint_url = "endpoint_url" # 替换为有效的 endpoint_url
bucket_2 = "bucket_2" # 替换为有效的 bucket
ak_2 = "ak_2" # 替换为有效的 access key
sk_2 = "sk_2" # 替换为有效的 secret key
endpoint_url_2 = "endpoint_url_2" # 替换为有效的 endpoint_url
# 多桶 S3 相关的
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## 将读取 s3://test_bucket1/test_prefix/abc
## 读文件 s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_reader1.read('abc')
## 将读取 s3://test_bucket1/efg
multi_bucket_s3_reader1.read('s3://test_bucket1/efg')
## 读文件 s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')
## 将读取 s3://test_bucket2/abc
multi_bucket_s3_reader1.read('s3://test_bucket2/abc')
## 读文件 s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')
# S3 相关的
# 初始化 s3 reader
s3_reader1 = S3DataReader(
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## 将读取 s3://test_bucket/test_prefix/abc
## 读文件 s3://{bucket}/{test_prefix}/abc
s3_reader1.read('abc')
## 将读取 s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## 读文件 s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
写入示例
----------
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
from magic_pdf.data.schemas import S3Config
# 初始化 reader
file_based_writer1 = FileBasedDataWriter("")
## 写数据 123 to abc
file_based_writer1.write("abc", "123".encode())
## 写数据 123 to abc
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter("/tmp")
## 写数据 123 to /tmp/abc
file_based_writer2.write_string("abc", "123")
## 写数据 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# 初始化多桶 s3 writer
bucket = "bucket" # 替换为有效的 bucket
ak = "ak" # 替换为有效的 access key
sk = "sk" # 替换为有效的 secret key
endpoint_url = "endpoint_url" # 替换为有效的 endpoint_url
bucket_2 = "bucket_2" # 替换为有效的 bucket
ak_2 = "ak_2" # 替换为有效的 access key
sk_2 = "sk_2" # 替换为有效的 secret key
endpoint_url_2 = "endpoint_url_2" # 替换为有效的 endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
],
)
# 文件相关的
file_based_writer1 = FileBasedDataWriter('')
## 将写入 123 到 abc
file_based_writer1.write('abc', '123'.encode())
## 将写入 123 到 abc
file_based_writer1.write_string('abc', '123')
file_based_writer2 = FileBasedDataWriter('/tmp')
## 将写入 123 到 /tmp/abc
file_based_writer2.write_string('abc', '123')
## 将写入 123 到 /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
# 多桶 S3 相关的
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## 将写入 123 到 s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
## 写数据 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## 将写入 123 s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write('abc', '123'.encode())
## 写数据 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write("abc", "123".encode())
## 将写入 123 s3://test_bucket1/efg
multi_bucket_s3_writer1.write('s3://test_bucket1/efg', '123'.encode())
## 写数据 123 to s3://{bucket}/{test_prefix}/efg
multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())
## 将写入 123 s3://test_bucket2/abc
multi_bucket_s3_writer1.write('s3://test_bucket2/abc', '123'.encode())
## 写数据 123 to s3://{bucket_2}/{test_prefix}/abc
multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())
# S3 相关的
s3_writer1 = S3DataWriter(
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
)
# 初始化 s3 writer
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## 将写入 123 s3://test_bucket/test_prefix/abc
s3_writer1.write('abc', '123'.encode())
## 写数据 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## 将写入 123 s3://test_bucket/test_prefix/abc
s3_writer1.write_string('abc', '123')
## 写数据 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write_string("abc", "123")
## 将写入 123 s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
## 写数据 123 to s3://{bucket}/efg
s3_writer1.write(f"s3://{bucket}/efg", "123".encode())
......@@ -61,10 +61,10 @@ read_local_pdfs
from magic_pdf.data.read_api import *
# 读取 PDF 路径
datasets = read_local_pdfs("tt.pdf")
datasets = read_local_pdfs("tt.pdf") # 替换为有效的文件
# 读取目录下的 PDF 文件
datasets = read_local_pdfs("pdfs/")
datasets = read_local_pdfs("pdfs/") # 替换为有效的文件目录
read_local_images
^^^^^^^^^^^^^^^^^^^
......@@ -76,7 +76,7 @@ read_local_images
from magic_pdf.data.read_api import *
# 从图像路径读取
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png") # 替换为有效的文件
# 从目录读取以 suffixes 数组中指定后缀结尾的文件
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"]) # 替换为有效的文件目录
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment