Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
3cd51d49
Commit
3cd51d49
authored
Dec 09, 2024
by
xu rui
Browse files
feat: rewrite code snippet
parent
6ca86bea
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
198 additions
and
142 deletions
+198
-142
next_docs/en/user_guide/data/data_reader_writer.rst
next_docs/en/user_guide/data/data_reader_writer.rst
+89
-61
next_docs/en/user_guide/data/read_api.rst
next_docs/en/user_guide/data/read_api.rst
+2
-2
next_docs/zh_cn/user_guide/data/data_reader_writer.rst
next_docs/zh_cn/user_guide/data/data_reader_writer.rst
+103
-75
next_docs/zh_cn/user_guide/data/read_api.rst
next_docs/zh_cn/user_guide/data/read_api.rst
+4
-4
No files found.
next_docs/en/user_guide/data/data_reader_writer.rst
View file @
3cd51d49
...
...
@@ -87,56 +87,70 @@ Read Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# file based related
# file based related
file_based_reader1 = FileBasedDataReader('')
## will read file abc
file_based_reader1.read('abc')
## will read file abc
file_based_reader1.read('abc')
file_based_reader2 = FileBasedDataReader('/tmp')
## will read /tmp/abc
file_based_reader2.read('abc')
## will read /
var
/logs/message.txt
file_based_reader2.read('/
var
/logs/message.txt')
## will read /
tmp
/logs/message.txt
file_based_reader2.read('/
tmp
/logs/message.txt')
# multi bucket s3 related
multi_bucket_s3_reader1 = MultiBucketS3DataReader("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=
test_
bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will read s3://
test_
bucket
1/
test_prefix/abc
## will read s3://
{
bucket
}/{
test_prefix
}
/abc
multi_bucket_s3_reader1.read('abc')
## will read s3://
test_
bucket
1
/efg
multi_bucket_s3_reader1.read('s3://
test_bucket1
/efg')
## will read s3://
{
bucket
}/{test_prefix}
/efg
multi_bucket_s3_reader1.read(
f
's3://
{bucket}/{test_prefix}
/efg')
## will read s3://
test_
bucket2/abc
multi_bucket_s3_reader1.read('s3://
test_
bucket
2
/abc')
## will read s3://
{
bucket_2
}/{test_prefix}
/abc
multi_bucket_s3_reader1.read(
f
's3://
{
bucket
_2}/{test_prefix}
/abc')
# s3 related
s3_reader1 = S3DataReader(
"
test_prefix
"
,
"test_
bucket
"
,
"
ak
"
,
"
sk
"
,
"localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
## will read s3://
test_
bucket
/
test_prefix/abc
## will read s3://
{
bucket
}/{
test_prefix
}
/abc
s3_reader1.read('abc')
## will read s3://
test_
bucket/efg
s3_reader1.read('s3://
test_
bucket/efg')
## will read s3://
{
bucket
}
/efg
s3_reader1.read(
f
's3://
{
bucket
}
/efg')
Write Examples
...
...
@@ -144,65 +158,79 @@ Write Examples
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
from magic_pdf.data.schemas import S3Config
# file based related
file_based_writer1 = FileBasedDataWriter(
''
)
# file based related
file_based_writer1 = FileBasedDataWriter(
""
)
## will write 123 to abc
file_based_writer1.write(
'
abc
'
,
'
123
'
.encode())
file_based_writer1.write(
"
abc
"
,
"
123
"
.encode())
## will write 123 to abc
file_based_writer1.write_string(
'
abc
'
,
'
123
')
file_based_writer1.write_string(
"
abc
"
,
"
123
")
file_based_writer2 = FileBasedDataWriter(
'
/tmp
'
)
file_based_writer2 = FileBasedDataWriter(
"
/tmp
"
)
## will write 123 to /tmp/abc
file_based_writer2.write_string(
'
abc
'
,
'
123
'
)
file_based_writer2.write_string(
"
abc
"
,
"
123
"
)
## will write 123 to /
var
/logs/message.txt
file_based_writer2.write_string(
'/var
/logs/message.txt
'
,
'
123
'
)
## will write 123 to /
tmp
/logs/message.txt
file_based_writer2.write_string(
"/tmp
/logs/message.txt
"
,
"
123
"
)
# multi bucket s3 related
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## will write 123 to s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
bucket = "bucket" # replace with real bucket
ak = "ak" # replace with real access key
sk = "sk" # replace with real secret key
endpoint_url = "endpoint_url" # replace with real endpoint_url
bucket_2 = "bucket_2" # replace with real bucket
ak_2 = "ak_2" # replace with real access key
sk_2 = "sk_2" # replace with real secret key
endpoint_url_2 = "endpoint_url_2" # replace with real endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
],
)
## will write 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
## will write 123 to s3://
test_
bucket
1/
test_prefix/abc
multi_bucket_s3_writer1.write(
'
abc
'
,
'
123
'
.encode())
## will write 123 to s3://
{
bucket
}/{
test_prefix
}
/abc
multi_bucket_s3_writer1.write(
"
abc
"
,
"
123
"
.encode())
## will write 123 to s3://
test_
bucket
1
/efg
multi_bucket_s3_writer1.write(
'
s3://
test_bucket1
/efg
'
,
'
123
'
.encode())
## will write 123 to s3://
{
bucket
}/{test_prefix}
/efg
multi_bucket_s3_writer1.write(
f"
s3://
{bucket}/{test_prefix}
/efg
"
,
"
123
"
.encode())
## will write 123 to s3://
test_
bucket
2
/abc
multi_bucket_s3_writer1.write('s3://
test_
bucket
2
/abc', '123'.encode())
## will write 123 to s3://
{
bucket
_2}/{test_prefix}
/abc
multi_bucket_s3_writer1.write(
f
's3://
{
bucket
_2}/{test_prefix}
/abc', '123'.encode())
# s3 related
s3_writer1 = S3DataWriter(
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
)
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
## will write 123 to s3://{bucket}/{test_prefix}/abc
s3_writer1.write("abc", "123".encode())
## will write 123 to s3://
test_
bucket
/
test_prefix/abc
s3_writer1.write
('
abc
'
,
'
123
'.encode()
)
## will write 123 to s3://
{
bucket
}/{
test_prefix
}
/abc
s3_writer1.write
_string("
abc
"
,
"
123
"
)
## will write 123 to s3://
test_
bucket
/test_prefix/abc
s3_writer1.write
_string('abc'
,
'
123
'
)
## will write 123 to s3://
{
bucket
}/efg
s3_writer1.write
(f"s3://{bucket}/efg"
,
"
123
".encode()
)
## will write 123 to s3://test_bucket/efg
s3_writer1.write('s3://test_bucket/efg', '123'.encode())
Check :doc:`../../api/data_reader_writer` for more details
next_docs/en/user_guide/data/read_api.rst
View file @
3cd51d49
...
...
@@ -80,10 +80,10 @@ Read images from path or directory
from magic_pdf.data.read_api import *
# read from image path
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png")
# replace with real file path
# read files from a directory whose names end with a suffix in the suffixes array
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"])
# replace with real directory
Check :doc:`../../api/read_api` for more details
\ No newline at end of file
next_docs/zh_cn/user_guide/data/data_reader_writer.rst
View file @
3cd51d49
...
...
@@ -73,118 +73,146 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
---------
.. code:: python
from magic_pdf.data.data_reader_writer import *
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
#
文件相关的
#
初始化 reader
file_based_reader1 = FileBasedDataReader('')
##
将读取
文件 abc
file_based_reader1.read('abc')
##
读本地
文件 abc
file_based_reader1.read('abc')
file_based_reader2 = FileBasedDataReader('/tmp')
##
将读取
/tmp/abc
##
读本地文件
/tmp/abc
file_based_reader2.read('abc')
## 将读取 /var/logs/message.txt
file_based_reader2.read('/var/logs/message.txt')
## 读本地文件 /tmp/logs/message.txt
file_based_reader2.read('/tmp/logs/message.txt')
# 初始化多桶 s3 reader
bucket = "bucket" # 替换为有效的 bucket
ak = "ak" # 替换为有效的 access key
sk = "sk" # 替换为有效的 secret key
endpoint_url = "endpoint_url" # 替换为有效的 endpoint_url
bucket_2 = "bucket_2" # 替换为有效的 bucket
ak_2 = "ak_2" # 替换为有效的 access key
sk_2 = "sk_2" # 替换为有效的 secret key
endpoint_url_2 = "endpoint_url_2" # 替换为有效的 endpoint_url
# 多桶 S3 相关的
multi_bucket_s3_reader1 = MultiBucketS3DataReader(
"test_
bucket
1/
test_prefix",
list
[S3Config(
bucket_name=
test_
bucket
1
, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
test_prefix = 'test/unittest'
multi_bucket_s3_reader1 = MultiBucketS3DataReader(
f"{
bucket
}/{
test_prefix
}
", [S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=
test_
bucket_2,
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
##
将读取
s3://
test_
bucket
1/
test_prefix/abc
##
读文件
s3://
{
bucket
}/{
test_prefix
}
/abc
multi_bucket_s3_reader1.read('abc')
##
将读取
s3://
test_
bucket
1
/efg
multi_bucket_s3_reader1.read('s3://
test_bucket1
/efg')
##
读文件
s3://
{
bucket
}/{test_prefix}
/efg
multi_bucket_s3_reader1.read(
f
's3://
{bucket}/{test_prefix}
/efg')
##
将读取
s3://
test_
bucket2/abc
multi_bucket_s3_reader1.read('s3://
test_
bucket
2
/abc')
##
读文件
s3://
{
bucket_2
}/{test_prefix}
/abc
multi_bucket_s3_reader1.read(
f
's3://
{
bucket
_2}/{test_prefix}
/abc')
#
S3 相关的
#
初始化 s3 reader
s3_reader1 = S3DataReader(
"
test_prefix
"
,
"test_
bucket
"
,
"
ak
"
,
"
sk
"
,
"localhost"
test_prefix,
bucket,
ak,
sk,
endpoint_url
)
##
将读取
s3://
test_
bucket
/
test_prefix/abc
##
读文件
s3://
{
bucket
}/{
test_prefix
}
/abc
s3_reader1.read('abc')
## 将读取 s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg')
## 读文件 s3://{bucket}/efg
s3_reader1.read(f's3://{bucket}/efg')
写入示例
----------
.. code:: python
import os
from magic_pdf.data.data_reader_writer import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
from magic_pdf.data.schemas import S3Config
# 初始化 writer
file_based_writer1 = FileBasedDataWriter("")
## 写数据 123 to abc
file_based_writer1.write("abc", "123".encode())
## 写数据 123 to abc
file_based_writer1.write_string("abc", "123")
file_based_writer2 = FileBasedDataWriter("/tmp")
## 写数据 123 to /tmp/abc
file_based_writer2.write_string("abc", "123")
## 写数据 123 to /tmp/logs/message.txt
file_based_writer2.write_string("/tmp/logs/message.txt", "123")
# 初始化多桶 s3 writer
bucket = "bucket" # 替换为有效的 bucket
ak = "ak" # 替换为有效的 access key
sk = "sk" # 替换为有效的 secret key
endpoint_url = "endpoint_url" # 替换为有效的 endpoint_url
bucket_2 = "bucket_2" # 替换为有效的 bucket
ak_2 = "ak_2" # 替换为有效的 access key
sk_2 = "sk_2" # 替换为有效的 secret key
endpoint_url_2 = "endpoint_url_2" # 替换为有效的 endpoint_url
test_prefix = "test/unittest"
multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
f"{bucket}/{test_prefix}",
[
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
],
)
# 文件相关的
file_based_writer1 = FileBasedDataWriter('')
## 将写入 123 到 abc
file_based_writer1.write('abc', '123'.encode())
## 将写入 123 到 abc
file_based_writer1.write_string('abc', '123')
file_based_writer2 = FileBasedDataWriter('/tmp')
## 将写入 123 到 /tmp/abc
file_based_writer2.write_string('abc', '123')
## 将写入 123 到 /var/logs/message.txt
file_based_writer2.write_string('/var/logs/message.txt', '123')
# 多桶 S3 相关的
multi_bucket_s3_writer1 = MultiBucketS3DataWriter("test_bucket1/test_prefix", list[S3Config(
bucket_name=test_bucket1, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=test_bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
)])
## 将写入 123 到 s3://test_bucket1/test_prefix/abc
multi_bucket_s3_writer1.write_string('abc', '123')
## 写数据 123 to s3://{bucket}/{test_prefix}/abc
multi_bucket_s3_writer1.write_string("abc", "123")
##
将写入
123
到
s3://
test_
bucket
1/
test_prefix/abc
multi_bucket_s3_writer1.write(
'
abc
'
,
'
123
'
.encode())
##
写数据
123
to
s3://
{
bucket
}/{
test_prefix
}
/abc
multi_bucket_s3_writer1.write(
"
abc
"
,
"
123
"
.encode())
##
将写入
123
到
s3://
test_bucket1
/efg
multi_bucket_s3_writer1.write(
'
s3://
test_bucket1
/efg
'
,
'
123
'
.encode())
##
写数据
123
to
s3://
{bucket}/{test_prefix}
/efg
multi_bucket_s3_writer1.write(
f"
s3://
{bucket}/{test_prefix}
/efg
"
,
"
123
"
.encode())
##
将写入
123
到
s3://
test_
bucket
2
/abc
multi_bucket_s3_writer1.write('s3://
test_
bucket
2
/abc', '123'.encode())
##
写数据
123
to
s3://
{
bucket
_2}/{test_prefix}
/abc
multi_bucket_s3_writer1.write(
f
's3://
{
bucket
_2}/{test_prefix}
/abc', '123'.encode())
# S3 相关的
s3_writer1 = S3DataWriter(
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
)
# 初始化 s3 writer
s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)
##
将写入
123
到
s3://
test_
bucket
/
test_prefix/abc
s3_writer1.write(
'
abc
'
,
'
123
'
.encode())
##
写数据
123
to
s3://
{
bucket
}/{
test_prefix
}
/abc
s3_writer1.write(
"
abc
"
,
"
123
"
.encode())
##
将写入
123
到
s3://
test_
bucket
/
test_prefix/abc
s3_writer1.write_string(
'
abc
'
,
'
123
'
)
##
写数据
123
to
s3://
{
bucket
}/{
test_prefix
}
/abc
s3_writer1.write_string(
"
abc
"
,
"
123
"
)
##
将写入
123
到
s3://
test_
bucket/efg
s3_writer1.write(
'
s3://
test_
bucket/efg
'
,
'
123
'
.encode())
##
写数据
123
to
s3://
{
bucket
}
/efg
s3_writer1.write(
f"
s3://
{
bucket
}
/efg
"
,
"
123
"
.encode())
next_docs/zh_cn/user_guide/data/read_api.rst
View file @
3cd51d49
...
...
@@ -61,10 +61,10 @@ read_local_pdfs
from magic_pdf.data.read_api import *
# 读取 PDF 路径
datasets = read_local_pdfs("tt.pdf")
datasets = read_local_pdfs("tt.pdf")
# 替换为有效的文件
# 读取目录下的 PDF 文件
datasets = read_local_pdfs("pdfs/")
datasets = read_local_pdfs("pdfs/")
# 替换为有效的文件目录
read_local_images
^^^^^^^^^^^^^^^^^^^
...
...
@@ -76,7 +76,7 @@ read_local_images
from magic_pdf.data.read_api import *
# 从图像路径读取
datasets = read_local_images("tt.png")
datasets = read_local_images("tt.png")
# 替换为有效的文件
# 从目录读取以 suffixes 数组中指定后缀结尾的文件
datasets = read_local_images("images/", suffixes=["png", "jpg"])
datasets = read_local_images("images/", suffixes=["png", "jpg"])
# 替换为有效的文件目录
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment