Unverified Commit bdacf291 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
parents 2df3e901 302a6950
Convert Doc
=============

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.doc -o output -m auto

API
^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_doc.doc"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

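Markdown is only one of the possible outputs. As a hedged sketch, the chained call can be split so the intermediate ``InferenceResult`` and ``PipeResult`` stay available; ``dump_content_list`` is assumed from the pipeline tutorial rather than shown above:

.. code:: python

    # the same pipeline as above, decomposed into its stages
    infer_result = ds.apply(doc_analyze, ocr=True)           # model inference
    pipe_result = infer_result.pipe_ocr_mode(image_writer)   # pipeline processing

    pipe_result.dump_md(md_writer, f"{input_file_name}.md", image_dir)
    # assumed helper: writes the parsed content as structured JSON
    pipe_result.dump_content_list(
        md_writer, f"{input_file_name}_content_list.json", image_dir
    )
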
Convert DocX
=============

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.docx -o output -m auto

API
^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_docx.docx"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Convert Image
===============

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.png -o output -m auto

API
^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"  # replace with a real image file
    input_file_name = input_file.split(".")[0]
    ds = read_local_images(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Convert PDF
============

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.pdf -o output -m auto

API
^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    name_without_suff = pdf_file_name.split(".")[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{name_without_suff}.md", image_dir
    )

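The snippet above forces OCR. PDFs with a usable text layer can skip it; a minimal sketch of the CLI's ``-m auto`` behavior, assuming the ``classify``/``pipe_txt_mode`` API described in the pipeline tutorial:

.. code:: python

    from magic_pdf.config.enums import SupportedPdfParseMethod

    ds = PymuDocDataset(pdf_bytes)

    # run OCR only when the PDF has no extractable text layer
    if ds.classify() == SupportedPdfParseMethod.OCR:
        pipe_result = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)
    else:
        pipe_result = ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer)

    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
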
Convert PPT
============

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.ppt -o output -m auto

API
^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Convert PPTX
=================

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.pptx -o output -m auto

API
^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_pptx.pptx"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

@@ -7,6 +7,5 @@ From beginning to end, shows how to use mineru via a minimal project

.. toctree::
    :maxdepth: 1

-   tutorial/output_file_description
    tutorial/pipeline

@@ -28,7 +28,6 @@ Minimal Example

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )
-   image_dir = str(os.path.basename(local_image_dir))

    # read bytes
    reader1 = FileBasedDataReader("")

@@ -85,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``

.. admonition:: Tip
    :class: tip

-   For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
    For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`

Usage
========

.. toctree::
    :maxdepth: 1

    usage/command_line
    usage/api
    usage/docker

API Usage
===========

Convert To Markdown
========================

PDF
----

Local File Example
^^^^^^^^^^^^^^^^^^

@@ -113,4 +115,112 @@ S3 File Example

    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

MS-Office
----------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
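To convert a whole folder of office documents, nothing more than a plain loop over the same API is needed; a minimal sketch, where the directory name and suffix filter are illustrative assumptions:

.. code:: python

    import os

    input_directory = "some_office_dir/"  # hypothetical directory of MS-Office files
    for name in sorted(os.listdir(input_directory)):
        # only MS-Office suffixes are routed through read_local_office
        if not name.endswith((".ppt", ".pptx", ".doc", ".docx")):
            continue
        ds = read_local_office(os.path.join(input_directory, name))[0]
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{os.path.splitext(name)[0]}.md", image_dir
        )
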
Image
---------

Single Image File
^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"  # replace with a real image file
    input_file_name = input_file.split(".")[0]
    ds = read_local_images(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instances, one per image
    input_directory = "some_image_dir/"  # replace with a real directory that contains images
    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])

    for count, ds in enumerate(dss):
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{count}.md", image_dir
        )

Check :doc:`../data/data_reader_writer` for more reader and writer examples, and see :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.

@@ -10,7 +10,8 @@ Command Line

    Options:
      -v, --version                display the version and exit
-     -p, --path PATH              local pdf filepath or directory  [required]
+     -p, --path PATH              local filepath or directory. Supports PDF, PPT,
+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
      -o, --output-dir PATH        output local directory  [required]
      -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
                                   technique to extract information from pdf. txt:

@@ -40,6 +41,20 @@ Command Line

    ## command line example
    magic-pdf -p {some_pdf} -o {some_output_dir} -m auto

.. admonition:: Important
    :class: tip

    The filename must end with one of the following suffixes:

    - .pdf
    - .png
    - .jpg
    - .ppt
    - .pptx
    - .doc
    - .docx

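As the next paragraph notes, ``-p`` also accepts a directory, so a batch run needs no extra code; the directory name below is illustrative:

.. code:: bash

    # convert every PDF under pdfs/ in one invocation
    magic-pdf -p pdfs/ -o output -m auto
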
``{some_pdf}`` can be a single PDF file or a directory containing
multiple PDFs. The results will be saved in the ``{some_output_dir}``
directory. The output file list is as follows:

@@ -57,6 +72,6 @@ directory. The output file list is as follows:

.. admonition:: Tip
    :class: tip

-   For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
+   For more information about the output files, please refer to :doc:`../inference_result` or :doc:`../pipe_result`

Docker
=======

.. admonition:: Important
    :class: tip

    Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.

    Before running this Docker image, you can use the following command to check whether your device supports CUDA acceleration on Docker.

    .. code-block:: bash

        docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi

.. code:: sh

    wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
    docker build -t mineru:latest .
    docker run --rm -it --gpus=all mineru:latest /bin/bash
    magic-pdf --help

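To convert a file stored on the host, the containing directory has to be mounted into the container; a hedged sketch, where the ``/data`` mount point and file names are assumptions rather than part of the image:

.. code:: sh

    # mount the current directory and convert a PDF from it
    docker run --rm --gpus=all -v $(pwd):/data mineru:latest \
        magic-pdf -p /data/a.pdf -o /data/output -m auto
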
@@ -73,118 +73,146 @@ S3DataReader is built on MultiBucketS3DataReader but only supports a single bucket. S3D

Read Example
-------------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import *
    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
    from magic_pdf.data.schemas import S3Config

    # initialize the readers
    file_based_reader1 = FileBasedDataReader('')

    ## read the local file abc
    file_based_reader1.read('abc')

    file_based_reader2 = FileBasedDataReader('/tmp')

    ## read the local file /tmp/abc
    file_based_reader2.read('abc')

    ## read the local file /tmp/logs/message.txt
    file_based_reader2.read('/tmp/logs/message.txt')

    # initialize the multi-bucket s3 reader
    bucket = "bucket"                  # replace with a valid bucket
    ak = "ak"                          # replace with a valid access key
    sk = "sk"                          # replace with a valid secret key
    endpoint_url = "endpoint_url"      # replace with a valid endpoint_url

    bucket_2 = "bucket_2"              # replace with a valid bucket
    ak_2 = "ak_2"                      # replace with a valid access key
    sk_2 = "sk_2"                      # replace with a valid secret key
    endpoint_url_2 = "endpoint_url_2"  # replace with a valid endpoint_url

    test_prefix = 'test/unittest'
    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
        bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
    ),
    S3Config(
        bucket_name=bucket_2,
        access_key=ak_2,
        secret_key=sk_2,
        endpoint_url=endpoint_url_2,
    )])

    ## read the file s3://{bucket}/{test_prefix}/abc
    multi_bucket_s3_reader1.read('abc')

    ## read the file s3://{bucket}/{test_prefix}/efg
    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')

    ## read the file s3://{bucket_2}/{test_prefix}/abc
    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')

    # initialize the s3 reader
    s3_reader1 = S3DataReader(
        test_prefix,
        bucket,
        ak,
        sk,
        endpoint_url
    )

    ## read the file s3://{bucket}/{test_prefix}/abc
    s3_reader1.read('abc')

    ## read the file s3://{bucket}/efg
    s3_reader1.read(f's3://{bucket}/efg')

Write Example
--------------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import *
    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
    from magic_pdf.data.schemas import S3Config

    # initialize the writers
    file_based_writer1 = FileBasedDataWriter("")

    ## write the bytes 123 to the file abc
    file_based_writer1.write("abc", "123".encode())

    ## write the string 123 to the file abc
    file_based_writer1.write_string("abc", "123")

    file_based_writer2 = FileBasedDataWriter("/tmp")

    ## write the string 123 to /tmp/abc
    file_based_writer2.write_string("abc", "123")

    ## write the string 123 to /tmp/logs/message.txt
    file_based_writer2.write_string("/tmp/logs/message.txt", "123")

    # initialize the multi-bucket s3 writer
    bucket = "bucket"                  # replace with a valid bucket
    ak = "ak"                          # replace with a valid access key
    sk = "sk"                          # replace with a valid secret key
    endpoint_url = "endpoint_url"      # replace with a valid endpoint_url

    bucket_2 = "bucket_2"              # replace with a valid bucket
    ak_2 = "ak_2"                      # replace with a valid access key
    sk_2 = "sk_2"                      # replace with a valid secret key
    endpoint_url_2 = "endpoint_url_2"  # replace with a valid endpoint_url

    test_prefix = "test/unittest"
    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
        f"{bucket}/{test_prefix}",
        [
            S3Config(
                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
            ),
            S3Config(
                bucket_name=bucket_2,
                access_key=ak_2,
                secret_key=sk_2,
                endpoint_url=endpoint_url_2,
            ),
        ],
    )

    ## write the string 123 to s3://{bucket}/{test_prefix}/abc
    multi_bucket_s3_writer1.write_string("abc", "123")

    ## write the bytes 123 to s3://{bucket}/{test_prefix}/abc
    multi_bucket_s3_writer1.write("abc", "123".encode())

    ## write the bytes 123 to s3://{bucket}/{test_prefix}/efg
    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())

    ## write the bytes 123 to s3://{bucket_2}/{test_prefix}/abc
    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())

    # initialize the s3 writer
    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)

    ## write the bytes 123 to s3://{bucket}/{test_prefix}/abc
    s3_writer1.write("abc", "123".encode())

    ## write the string 123 to s3://{bucket}/{test_prefix}/abc
    s3_writer1.write_string("abc", "123")

    ## write the bytes 123 to s3://{bucket}/efg
    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())

@@ -15,13 +15,41 @@ read_jsonl

.. code:: python

    from magic_pdf.data.read_api import *
    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
    from magic_pdf.data.schemas import S3Config

    # read a local jsonl file
    datasets = read_jsonl("tt.jsonl", None)  # replace with a valid file

    # read a jsonl file stored on s3
    bucket = "bucket_1"                # replace with a valid s3 bucket
    ak = "access_key_1"                # replace with a valid s3 access key
    sk = "secret_key_1"                # replace with a valid s3 secret key
    endpoint_url = "endpoint_url_1"    # replace with a valid s3 endpoint url

    bucket_2 = "bucket_2"              # replace with a valid s3 bucket
    ak_2 = "access_key_2"              # replace with a valid s3 access key
    sk_2 = "secret_key_2"              # replace with a valid s3 secret key
    endpoint_url_2 = "endpoint_url_2"  # replace with a valid s3 endpoint url

    s3configs = [
        S3Config(
            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=ak_2,
            secret_key=sk_2,
            endpoint_url=endpoint_url_2,
        ),
    ]

    s3_reader = MultiBucketS3DataReader(bucket, s3configs)

    datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader)  # replace with a valid s3 jsonl file

read_local_pdfs
^^^^^^^^^^^^^^^^

@@ -30,13 +58,13 @@ read_local_pdfs

.. code:: python

    from magic_pdf.data.read_api import *

    # read PDF from a path
    datasets = read_local_pdfs("tt.pdf")  # replace with a valid file

    # read the PDF files under a directory
    datasets = read_local_pdfs("pdfs/")  # replace with a valid directory

read_local_images
^^^^^^^^^^^^^^^^^^^

@@ -45,10 +73,10 @@ read_local_images

.. code:: python

    from magic_pdf.data.read_api import *

    # read from an image path
    datasets = read_local_images("tt.png")  # replace with a valid file

    # from a directory, read the files whose suffix is in the suffixes array
    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with a valid directory

@@ -19,7 +19,7 @@ def test_read_local_pdfs():

 def test_read_local_images():
-    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['png'])
+    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['.png'])
     assert len(datasets) == 2
     assert len(datasets[0]) == 1
     assert len(datasets[1]) == 1