Unverified Commit bdacf291 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #1257 from icecraft/docs/refactor_en_docs

Docs/refactor en docs
parents 2df3e901 302a6950
Convert Doc
=============

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.doc -o output -m auto

API
^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_doc.doc"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

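Markdown is only one of the possible outputs. As a hedged sketch, the chained call can be split so the intermediate ``InferenceResult`` and ``PipeResult`` stay available; ``dump_content_list`` is assumed from the pipeline tutorial rather than shown above:

.. code:: python

    # the same pipeline as above, decomposed into its stages
    infer_result = ds.apply(doc_analyze, ocr=True)           # model inference
    pipe_result = infer_result.pipe_ocr_mode(image_writer)   # pipeline processing

    pipe_result.dump_md(md_writer, f"{input_file_name}.md", image_dir)
    # assumed helper: writes the parsed content as structured JSON
    pipe_result.dump_content_list(
        md_writer, f"{input_file_name}_content_list.json", image_dir
    )
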
Convert DocX
=============

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.docx -o output -m auto

API
^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_docx.docx"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Convert Image
===============

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.png -o output -m auto

API
^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"  # replace with a real image file
    input_file_name = input_file.split(".")[0]
    ds = read_local_images(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Convert PDF
============

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.pdf -o output -m auto

API
^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
    name_without_suff = pdf_file_name.split(".")[0]

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # read bytes
    reader1 = FileBasedDataReader("")
    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

    # proc
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{name_without_suff}.md", image_dir
    )

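The snippet above forces OCR. PDFs with a usable text layer can skip it; a minimal sketch of the CLI's ``-m auto`` behavior, assuming the ``classify``/``pipe_txt_mode`` API described in the pipeline tutorial:

.. code:: python

    from magic_pdf.config.enums import SupportedPdfParseMethod

    ds = PymuDocDataset(pdf_bytes)

    # run OCR only when the PDF has no extractable text layer
    if ds.classify() == SupportedPdfParseMethod.OCR:
        pipe_result = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)
    else:
        pipe_result = ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer)

    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
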
Convert PPT
============

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.ppt -o output -m auto

API
^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Convert PPTX
=================

.. admonition:: Warning
    :class: tip

    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.

Command Line
^^^^^^^^^^^^^

.. code:: bash

    # make sure the file has the correct suffix
    magic-pdf -p a.pptx -o output -m auto

API
^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_pptx.pptx"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

@@ -7,6 +7,5 @@ From beginning to end, shows how to use mineru via a minimal project

.. toctree::
    :maxdepth: 1

-   tutorial/output_file_description
    tutorial/pipeline

@@ -28,7 +28,6 @@ Minimal Example

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )
-   image_dir = str(os.path.basename(local_image_dir))

    # read bytes
    reader1 = FileBasedDataReader("")

@@ -85,8 +84,6 @@ These stages are linked together through methods like ``apply``, ``doc_analyze``

.. admonition:: Tip
    :class: tip

-   For more examples on how to use ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../quick_start/to_markdown`
    For more detailed information about ``Dataset``, ``InferenceResult``, and ``PipeResult``, please refer to :doc:`../../api/dataset`, :doc:`../../api/model_operators`, :doc:`../../api/pipe_operators`

Usage
========

.. toctree::
    :maxdepth: 1

    usage/command_line
    usage/api
    usage/docker

API Usage
===========

Convert To Markdown
========================

PDF
----

Local File Example
^^^^^^^^^^^^^^^^^^

@@ -113,4 +115,112 @@ S3 File Example

    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

MS-Office
----------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_office

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_ppt.ppt"  # replace with a real MS-Office file
    input_file_name = input_file.split(".")[0]
    ds = read_local_office(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

This code snippet can be used to process **ppt**, **pptx**, **doc**, and **docx** files.
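To convert a whole folder of office documents, nothing more than a plain loop over the same API is needed; a minimal sketch, where the directory name and suffix filter are illustrative assumptions:

.. code:: python

    import os

    input_directory = "some_office_dir/"  # hypothetical directory of MS-Office files
    for name in sorted(os.listdir(input_directory)):
        # only MS-Office suffixes are routed through read_local_office
        if not name.endswith((".ppt", ".pptx", ".doc", ".docx")):
            continue
        ds = read_local_office(os.path.join(input_directory, name))[0]
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{os.path.splitext(name)[0]}.md", image_dir
        )
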
Image
---------

Single Image File
^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instance
    input_file = "some_image.jpg"  # replace with a real image file
    input_file_name = input_file.split(".")[0]
    ds = read_local_images(input_file)[0]

    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
        md_writer, f"{input_file_name}.md", image_dir
    )

Directory That Contains Images
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.data.read_api import read_local_images

    # prepare env
    local_image_dir, local_md_dir = "output/images", "output"
    image_dir = str(os.path.basename(local_image_dir))

    os.makedirs(local_image_dir, exist_ok=True)

    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
        local_md_dir
    )

    # proc
    ## Create Dataset Instances, one per image
    input_directory = "some_image_dir/"  # replace with a real directory that contains images
    dss = read_local_images(input_directory, suffixes=['.png', '.jpg'])

    for count, ds in enumerate(dss):
        ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
            md_writer, f"{count}.md", image_dir
        )

Check :doc:`../data/data_reader_writer` for more reader and writer examples, and see :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for API details.

@@ -10,7 +10,8 @@ Command Line

    Options:
      -v, --version                display the version and exit
-     -p, --path PATH              local pdf filepath or directory  [required]
+     -p, --path PATH              local filepath or directory. Supports PDF, PPT,
+                                  PPTX, DOC, DOCX, PNG, JPG files  [required]
      -o, --output-dir PATH        output local directory  [required]
      -m, --method [ocr|txt|auto]  the method for parsing pdf. ocr: using ocr
                                   technique to extract information from pdf. txt:

@@ -40,6 +41,20 @@ Command Line

    ## command line example
    magic-pdf -p {some_pdf} -o {some_output_dir} -m auto

.. admonition:: Important
    :class: tip

    The filename must end with one of the following suffixes:

    - .pdf
    - .png
    - .jpg
    - .ppt
    - .pptx
    - .doc
    - .docx

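As the next paragraph notes, ``-p`` also accepts a directory, so a batch run needs no extra code; the directory name below is illustrative:

.. code:: bash

    # convert every PDF under pdfs/ in one invocation
    magic-pdf -p pdfs/ -o output -m auto
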
``{some_pdf}`` can be a single PDF file or a directory containing
multiple PDFs. The results will be saved in the ``{some_output_dir}``
directory. The output file list is as follows:

@@ -57,6 +72,6 @@ directory. The output file list is as follows:

.. admonition:: Tip
    :class: tip

-   For more information about the output files, please refer to the :doc:`../tutorial/output_file_description`
+   For more information about the output files, please refer to :doc:`../inference_result` or :doc:`../pipe_result`

Docker
=======

.. admonition:: Important
    :class: tip

    Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.

    Before running this Docker image, you can use the following command to check whether your device supports CUDA acceleration on Docker.

    .. code-block:: bash

        docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi

.. code:: sh

    wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
    docker build -t mineru:latest .
    docker run --rm -it --gpus=all mineru:latest /bin/bash
    magic-pdf --help

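To convert a file stored on the host, the containing directory has to be mounted into the container; a hedged sketch, where the ``/data`` mount point and file names are assumptions rather than part of the image:

.. code:: sh

    # mount the current directory and convert a PDF from it
    docker run --rm --gpus=all -v $(pwd):/data mineru:latest \
        magic-pdf -p /data/a.pdf -o /data/output -m auto
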
@@ -73,118 +73,146 @@ S3DataReader is built on MultiBucketS3DataReader but only supports a single bucket. S3D

Read Example
-------------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import *
    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
    from magic_pdf.data.schemas import S3Config

    # initialize the readers
    file_based_reader1 = FileBasedDataReader('')

    ## read the local file abc
    file_based_reader1.read('abc')

    file_based_reader2 = FileBasedDataReader('/tmp')

    ## read the local file /tmp/abc
    file_based_reader2.read('abc')

    ## read the local file /tmp/logs/message.txt
    file_based_reader2.read('/tmp/logs/message.txt')

    # initialize the multi-bucket s3 reader
    bucket = "bucket"                  # replace with a valid bucket
    ak = "ak"                          # replace with a valid access key
    sk = "sk"                          # replace with a valid secret key
    endpoint_url = "endpoint_url"      # replace with a valid endpoint_url

    bucket_2 = "bucket_2"              # replace with a valid bucket
    ak_2 = "ak_2"                      # replace with a valid access key
    sk_2 = "sk_2"                      # replace with a valid secret key
    endpoint_url_2 = "endpoint_url_2"  # replace with a valid endpoint_url

    test_prefix = 'test/unittest'
    multi_bucket_s3_reader1 = MultiBucketS3DataReader(f"{bucket}/{test_prefix}", [S3Config(
        bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
    ),
    S3Config(
        bucket_name=bucket_2,
        access_key=ak_2,
        secret_key=sk_2,
        endpoint_url=endpoint_url_2,
    )])

    ## read the file s3://{bucket}/{test_prefix}/abc
    multi_bucket_s3_reader1.read('abc')

    ## read the file s3://{bucket}/{test_prefix}/efg
    multi_bucket_s3_reader1.read(f's3://{bucket}/{test_prefix}/efg')

    ## read the file s3://{bucket_2}/{test_prefix}/abc
    multi_bucket_s3_reader1.read(f's3://{bucket_2}/{test_prefix}/abc')

    # initialize the s3 reader
    s3_reader1 = S3DataReader(
        test_prefix,
        bucket,
        ak,
        sk,
        endpoint_url
    )

    ## read the file s3://{bucket}/{test_prefix}/abc
    s3_reader1.read('abc')

    ## read the file s3://{bucket}/efg
    s3_reader1.read(f's3://{bucket}/efg')

Write Example
--------------

.. code:: python

    import os

    from magic_pdf.data.data_reader_writer import *
    from magic_pdf.data.data_reader_writer import MultiBucketS3DataWriter
    from magic_pdf.data.schemas import S3Config

    # initialize the writers
    file_based_writer1 = FileBasedDataWriter("")

    ## write the bytes 123 to the file abc
    file_based_writer1.write("abc", "123".encode())

    ## write the string 123 to the file abc
    file_based_writer1.write_string("abc", "123")

    file_based_writer2 = FileBasedDataWriter("/tmp")

    ## write the string 123 to /tmp/abc
    file_based_writer2.write_string("abc", "123")

    ## write the string 123 to /tmp/logs/message.txt
    file_based_writer2.write_string("/tmp/logs/message.txt", "123")

    # initialize the multi-bucket s3 writer
    bucket = "bucket"                  # replace with a valid bucket
    ak = "ak"                          # replace with a valid access key
    sk = "sk"                          # replace with a valid secret key
    endpoint_url = "endpoint_url"      # replace with a valid endpoint_url

    bucket_2 = "bucket_2"              # replace with a valid bucket
    ak_2 = "ak_2"                      # replace with a valid access key
    sk_2 = "sk_2"                      # replace with a valid secret key
    endpoint_url_2 = "endpoint_url_2"  # replace with a valid endpoint_url

    test_prefix = "test/unittest"
    multi_bucket_s3_writer1 = MultiBucketS3DataWriter(
        f"{bucket}/{test_prefix}",
        [
            S3Config(
                bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
            ),
            S3Config(
                bucket_name=bucket_2,
                access_key=ak_2,
                secret_key=sk_2,
                endpoint_url=endpoint_url_2,
            ),
        ],
    )

    ## write the string 123 to s3://{bucket}/{test_prefix}/abc
    multi_bucket_s3_writer1.write_string("abc", "123")

    ## write the bytes 123 to s3://{bucket}/{test_prefix}/abc
    multi_bucket_s3_writer1.write("abc", "123".encode())

    ## write the bytes 123 to s3://{bucket}/{test_prefix}/efg
    multi_bucket_s3_writer1.write(f"s3://{bucket}/{test_prefix}/efg", "123".encode())

    ## write the bytes 123 to s3://{bucket_2}/{test_prefix}/abc
    multi_bucket_s3_writer1.write(f's3://{bucket_2}/{test_prefix}/abc', '123'.encode())

    # initialize the s3 writer
    s3_writer1 = S3DataWriter(test_prefix, bucket, ak, sk, endpoint_url)

    ## write the bytes 123 to s3://{bucket}/{test_prefix}/abc
    s3_writer1.write("abc", "123".encode())

    ## write the string 123 to s3://{bucket}/{test_prefix}/abc
    s3_writer1.write_string("abc", "123")

    ## write the bytes 123 to s3://{bucket}/efg
    s3_writer1.write(f"s3://{bucket}/efg", "123".encode())

@@ -15,13 +15,41 @@ read_jsonl

.. code:: python

    from magic_pdf.data.read_api import *
    from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
    from magic_pdf.data.schemas import S3Config

    # read a local jsonl file
    datasets = read_jsonl("tt.jsonl", None)  # replace with a valid file

    # read a jsonl file stored on s3
    bucket = "bucket_1"                # replace with a valid s3 bucket
    ak = "access_key_1"                # replace with a valid s3 access key
    sk = "secret_key_1"                # replace with a valid s3 secret key
    endpoint_url = "endpoint_url_1"    # replace with a valid s3 endpoint url

    bucket_2 = "bucket_2"              # replace with a valid s3 bucket
    ak_2 = "access_key_2"              # replace with a valid s3 access key
    sk_2 = "secret_key_2"              # replace with a valid s3 secret key
    endpoint_url_2 = "endpoint_url_2"  # replace with a valid s3 endpoint url

    s3configs = [
        S3Config(
            bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
        ),
        S3Config(
            bucket_name=bucket_2,
            access_key=ak_2,
            secret_key=sk_2,
            endpoint_url=endpoint_url_2,
        ),
    ]

    s3_reader = MultiBucketS3DataReader(bucket, s3configs)

    datasets = read_jsonl(f"s3://{bucket}/tt.jsonl", s3_reader)  # replace with a valid s3 jsonl file

read_local_pdfs
^^^^^^^^^^^^^^^^

@@ -30,13 +58,13 @@ read_local_pdfs

.. code:: python

    from magic_pdf.data.read_api import *

    # read PDF from a path
    datasets = read_local_pdfs("tt.pdf")  # replace with a valid file

    # read the PDF files under a directory
    datasets = read_local_pdfs("pdfs/")  # replace with a valid directory

read_local_images
^^^^^^^^^^^^^^^^^^^

@@ -45,10 +73,10 @@ read_local_images

.. code:: python

    from magic_pdf.data.read_api import *

    # read from an image path
    datasets = read_local_images("tt.png")  # replace with a valid file

    # from a directory, read the files whose suffix is in the suffixes array
    datasets = read_local_images("images/", suffixes=[".png", ".jpg"])  # replace with a valid directory

@@ -19,7 +19,7 @@ def test_read_local_pdfs():

 def test_read_local_images():
-    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['png'])
+    datasets = read_local_images('tests/unittest/test_data/assets/pngs', suffixes=['.png'])
     assert len(datasets) == 2
     assert len(datasets[0]) == 1
     assert len(datasets[1]) == 1