Commit 47db844c authored by xu rui's avatar xu rui
Browse files

feat: add more docs about data related api

parent 73afb7d6
IO
==
.. autoclass:: magic_pdf.data.io.base.IOReader
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.base.IOWriter
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.s3.S3Reader
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.s3.S3Writer
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.http.HttpReader
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.http.HttpWriter
:members:
:inherited-members:
:show-inheritance:
schemas
===========
.. autopydantic_model:: magic_pdf.data.schemas.S3Config
:members:
.. autopydantic_model:: magic_pdf.data.schemas.PageInfo
:members:
import os
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer from magic_pdf.data.io.s3 import S3Reader, S3Writer
...@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path, ...@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
class MultiS3Mixin: class MultiS3Mixin:
def __init__(self, default_bucket: str, s3_configs: list[S3Config]): def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
"""Initialized with multiple s3 configs. """Initialized with multiple s3 configs.
Args: Args:
default_bucket (str): the default bucket name of the relative path default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list. s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
Raises: Raises:
InvalidConfig: default bucket config not in s3_configs InvalidConfig: default bucket config not in s3_configs.
InvalidConfig: bucket name not unique in s3_configs InvalidConfig: bucket name not unique in s3_configs.
InvalidConfig: default bucket must be provided InvalidConfig: default bucket must be provided.
""" """
if len(default_bucket) == 0: if len(default_prefix) == 0:
raise InvalidConfig('default_bucket must be provided') raise InvalidConfig('default_prefix must be provided')
arr = default_prefix.strip("/").split("/")
self.default_bucket = arr[0]
self.default_prefix = "/".join(arr[1:])
found_default_bucket_config = False found_default_bucket_config = False
for conf in s3_configs: for conf in s3_configs:
if conf.bucket_name == default_bucket: if conf.bucket_name == self.default_bucket:
found_default_bucket_config = True found_default_bucket_config = True
break break
if not found_default_bucket_config: if not found_default_bucket_config:
raise InvalidConfig( raise InvalidConfig(
f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}' f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
) )
uniq_bucket = set([conf.bucket_name for conf in s3_configs]) uniq_bucket = set([conf.bucket_name for conf in s3_configs])
...@@ -39,7 +44,6 @@ class MultiS3Mixin: ...@@ -39,7 +44,6 @@ class MultiS3Mixin:
f'the bucket_name in s3_configs: {s3_configs} must be unique' f'the bucket_name in s3_configs: {s3_configs} must be unique'
) )
self.default_bucket = default_bucket
self.s3_configs = s3_configs self.s3_configs = s3_configs
self._s3_clients_h: dict = {} self._s3_clients_h: dict = {}
...@@ -47,14 +51,14 @@ class MultiS3Mixin: ...@@ -47,14 +51,14 @@ class MultiS3Mixin:
class MultiBucketS3DataReader(DataReader, MultiS3Mixin): class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
def read(self, path: str) -> bytes: def read(self, path: str) -> bytes:
"""Read the path from s3, select diffect bucket client for each request """Read the path from s3, select diffect bucket client for each request
based on the path, also support range read. based on the bucket, also support range read.
Args: Args:
path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
for example: s3://bucket_name/path?0,100 for example: s3://bucket_name/path?0,100.
Returns: Returns:
bytes: the content of s3 file bytes: the content of s3 file.
""" """
may_range_params = parse_s3_range_params(path) may_range_params = parse_s3_range_params(path)
if may_range_params is None or 2 != len(may_range_params): if may_range_params is None or 2 != len(may_range_params):
...@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin): ...@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
"""Read the file with offset and limit, select diffect bucket client """Read the file with offset and limit, select diffect bucket client
for each request based on the path. for each request based on the bucket.
Args: Args:
path (str): the file path path (str): the file path.
offset (int, optional): the number of bytes skipped. Defaults to 0. offset (int, optional): the number of bytes skipped. Defaults to 0.
limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite. limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
Returns: Returns:
bytes: the file content bytes: the file content.
""" """
if path.startswith('s3://'): if path.startswith('s3://'):
bucket_name, path = parse_s3path(path) bucket_name, path = parse_s3path(path)
s3_reader = self.__get_s3_client(bucket_name) s3_reader = self.__get_s3_client(bucket_name)
else: else:
s3_reader = self.__get_s3_client(self.default_bucket) s3_reader = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path)
return s3_reader.read_at(path, offset, limit) return s3_reader.read_at(path, offset, limit)
...@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin): ...@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
def write(self, path: str, data: bytes) -> None: def write(self, path: str, data: bytes) -> None:
"""Write file with data, also select diffect bucket client for each """Write file with data, also select diffect bucket client for each
request based on the path. request based on the bucket.
Args: Args:
path (str): the path of file, if the path is relative path, it will be joined with parent_dir. path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
data (bytes): the data want to write data (bytes): the data want to write.
""" """
if path.startswith('s3://'): if path.startswith('s3://'):
bucket_name, path = parse_s3path(path) bucket_name, path = parse_s3path(path)
s3_writer = self.__get_s3_client(bucket_name) s3_writer = self.__get_s3_client(bucket_name)
else: else:
s3_writer = self.__get_s3_client(self.default_bucket) s3_writer = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path)
return s3_writer.write(path, data) return s3_writer.write(path, data)
...@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config ...@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
class S3DataReader(MultiBucketS3DataReader): class S3DataReader(MultiBucketS3DataReader):
def __init__( def __init__(
self, self,
default_prefix_without_bucket: str,
bucket: str, bucket: str,
ak: str, ak: str,
sk: str, sk: str,
...@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader): ...@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
"""s3 reader client. """s3 reader client.
Args: Args:
default_prefix_without_bucket: prefix that not contains bucket
bucket (str): bucket name bucket (str): bucket name
ak (str): access key ak (str): access key
sk (str): secret key sk (str): secret key
...@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader): ...@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
""" """
super().__init__( super().__init__(
bucket, f"{bucket}/{default_prefix_without_bucket}"
[ [
S3Config( S3Config(
bucket_name=bucket, bucket_name=bucket,
...@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader): ...@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
class S3DataWriter(MultiBucketS3DataWriter): class S3DataWriter(MultiBucketS3DataWriter):
def __init__( def __init__(
self, self,
default_prefix_without_bucket: str,
bucket: str, bucket: str,
ak: str, ak: str,
sk: str, sk: str,
...@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter): ...@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
"""s3 writer client. """s3 writer client.
Args: Args:
default_prefix_without_bucket: prefix that not contains bucket
bucket (str): bucket name bucket (str): bucket name
ak (str): access key ak (str): access key
sk (str): secret key sk (str): secret key
...@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter): ...@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
""" """
super().__init__( super().__init__(
bucket, f"{bucket}/{default_prefix_without_bucket}"
[ [
S3Config( S3Config(
bucket_name=bucket, bucket_name=bucket,
......
from magic_pdf.data.io.base import IOReader, IOWriter # noqa: F401
from magic_pdf.data.io.http import HttpReader, HttpWriter # noqa: F401
from magic_pdf.data.io.s3 import S3Reader, S3Writer # noqa: F401
__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
...@@ -29,7 +29,7 @@ class IOReader(ABC): ...@@ -29,7 +29,7 @@ class IOReader(ABC):
pass pass
class IOWriter: class IOWriter(ABC):
@abstractmethod @abstractmethod
def write(self, path: str, data: bytes) -> None: def write(self, path: str, data: bytes) -> None:
......
...@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field ...@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
class S3Config(BaseModel): class S3Config(BaseModel):
"""S3 config
"""
bucket_name: str = Field(description='s3 bucket name', min_length=1) bucket_name: str = Field(description='s3 bucket name', min_length=1)
access_key: str = Field(description='s3 access key', min_length=1) access_key: str = Field(description='s3 access key', min_length=1)
secret_key: str = Field(description='s3 secret key', min_length=1) secret_key: str = Field(description='s3 secret key', min_length=1)
...@@ -11,5 +13,7 @@ class S3Config(BaseModel): ...@@ -11,5 +13,7 @@ class S3Config(BaseModel):
class PageInfo(BaseModel): class PageInfo(BaseModel):
"""The width and height of page
"""
w: float = Field(description='the width of page') w: float = Field(description='the width of page')
h: float = Field(description='the height of page') h: float = Field(description='the height of page')
Changelog
=========
- 2024/09/27 Version 0.8.1 released, Fixed some bugs, and providing a
`localized deployment version <projects/web_demo/README.md>`__ of the
`online
demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
the `front-end interface <projects/web/README.md>`__.
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with
Dockerfile, and launching demos on Huggingface and Modelscope.
- 2024/08/30: Version 0.7.1 released, add paddle tablemaster table
recognition option
- 2024/08/09: Version 0.7.0b1 released, simplified installation
process, added table recognition functionality
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
issues and installation documentation
- 2024/07/05: Initial open-source release
\ No newline at end of file
FAQ
==========================
1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
On macOS, the default shell has switched from Bash to Z shell, which has
special handling logic for certain types of string matching. This can
lead to the “no matches found” error. You can try disabling the globbing
feature in the command line and then run the installation command again.
.. code:: bash
setopt no_nomatch
pip install magic-pdf[full]
2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This might be due to an incomplete download of the model file. You can
re-download the model file and retry. Reference:
https://github.com/opendatalab/MinerU/issues/143
3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The path for the model files is configured in “magic-pdf.json”, like
this:
.. code:: json
{
"models-dir": "/tmp/models"
}
This path is an absolute path, not a relative path. You can obtain the
absolute path of the models directory using the “pwd” command.
Reference:
https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
install the ``libgl`` library with the following command to resolve the
issue:
.. code:: bash
sudo apt-get install libgl1-mesa-glx
Reference: https://github.com/opendatalab/MinerU/issues/388
5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You need to uninstall the module and reinstall it:
.. code:: bash
pip uninstall fairscale
pip install fairscale
Reference: https://github.com/opendatalab/MinerU/issues/411
6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The compatibility of CUDA 11 with new graphics cards is poor, and the
CUDA version used by Paddle needs to be upgraded.
.. code:: bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
Reference: https://github.com/opendatalab/MinerU/issues/558
Glossary
===========
1. jsonl
   JSON Lines: a text format in which each line is a standalone JSON
   object, commonly used for streaming or batch-processing structured
   records.
Known Issues
============
- Reading order is based on the model’s sorting of text distribution in
space, which may become disordered under extremely complex layouts.
- Vertical text is not supported.
- Tables of contents and lists are recognized through rules; a few
uncommon list formats may not be identified.
- Only one level of headings is supported; hierarchical heading levels
are currently not supported.
- Code blocks are not yet supported in the layout model.
- Comic books, art books, elementary school textbooks, and exercise
  books are not well-parsed yet.
- Enabling OCR may produce better results in PDFs with a high density
  of formulas.
- If you are processing PDFs with a large number of formulas, it is
strongly recommended to enable the OCR function. When using PyMuPDF
to extract text, overlapping text lines can occur, leading to
inaccurate formula insertion positions.
Data Api
------------------
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
api/dataset.rst api/dataset
api/data_reader_writer.rst api/data_reader_writer
api/read_api.rst api/read_api
api/schemas
api/io
api/classes
\ No newline at end of file
Class Hierarchy
===============
.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
:parts: 2
.. inheritance-diagram:: magic_pdf.data.dataset
:parts: 2
.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
:parts: 2
Data Reader Writer Data Reader Writer
-------------------- ===================
.. autoclass:: magic_pdf.data.data_reader_writer.DataReader .. autoclass:: magic_pdf.data.data_reader_writer.DataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter .. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader .. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter .. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
Dataset Api Dataset
------------------ ========
.. autoclass:: magic_pdf.data.dataset.PageableData .. autoclass:: magic_pdf.data.dataset.PageableData
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.Dataset .. autoclass:: magic_pdf.data.dataset.Dataset
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.ImageDataset .. autoclass:: magic_pdf.data.dataset.ImageDataset
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.PymuDocDataset .. autoclass:: magic_pdf.data.dataset.PymuDocDataset
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.Doc .. autoclass:: magic_pdf.data.dataset.Doc
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
read_api Api read_api
------------------ =========
.. automodule:: magic_pdf.data.read_api .. automodule:: magic_pdf.data.read_api
:members: :members:
......
...@@ -15,7 +15,8 @@ import subprocess ...@@ -15,7 +15,8 @@ import subprocess
import sys import sys
from sphinx.ext import autodoc from sphinx.ext import autodoc
from docutils import nodes
from docutils.parsers.rst import Directive
def install(package): def install(package):
subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
...@@ -58,10 +59,20 @@ extensions = [ ...@@ -58,10 +59,20 @@ extensions = [
'sphinx_copybutton', 'sphinx_copybutton',
'sphinx.ext.autodoc', 'sphinx.ext.autodoc',
'sphinx.ext.autosummary', 'sphinx.ext.autosummary',
'sphinx.ext.inheritance_diagram',
'myst_parser', 'myst_parser',
'sphinxarg.ext', 'sphinxarg.ext',
'sphinxcontrib.autodoc_pydantic',
] ]
# class hierarchy diagram
inheritance_graph_attrs = dict(rankdir="LR", size='"8.0, 12.0"', fontsize=14, ratio='compress')
inheritance_node_attrs = dict(shape='ellipse', fontsize=14, height=0.75)
inheritance_edge_attrs = dict(arrow='vee')
autodoc_pydantic_model_show_json = True
autodoc_pydantic_model_show_config_summary = False
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
...@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter): ...@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
autodoc.ClassDocumenter = MockedClassDocumenter autodoc.ClassDocumenter = MockedClassDocumenter
navigation_with_keys = False navigation_with_keys = False
# add custom directive
class VideoDirective(Directive):
required_arguments = 1
optional_arguments = 0
final_argument_whitespace = True
option_spec = {}
def run(self):
url = self.arguments[0]
video_node = nodes.raw('', f'<iframe width="560" height="315" src="{url}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>', format='html')
return [video_node]
def setup(app):
app.add_directive('video', VideoDirective)
\ No newline at end of file
...@@ -26,6 +26,50 @@ Welcome to the MinerU Documentation ...@@ -26,6 +26,50 @@ Welcome to the MinerU Documentation
</p> </p>
Project Introduction
--------------------
MinerU is a tool that converts PDFs into machine-readable formats (e.g.,
markdown, JSON), allowing for easy extraction into any format. MinerU
was born during the pre-training process of
`InternLM <https://github.com/InternLM/InternLM>`__. We focus on solving
symbol conversion issues in scientific literature and hope to contribute
to technological development in the era of large models. Compared to
well-known commercial products, MinerU is still young. If you encounter
any issues or if the results are not as expected, please file a report
on the `issue tracker <https://github.com/opendatalab/MinerU/issues>`__
and **attach the relevant PDF**.
.. video:: https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
Key Features
------------
- Removes elements such as headers, footers, footnotes, and page
numbers while maintaining semantic continuity
- Outputs text in a human-readable order from multi-column documents
- Retains the original structure of the document, including titles,
paragraphs, and lists
- Extracts images, image captions, tables, and table captions
- Automatically recognizes formulas in the document and converts them
to LaTeX
- Automatically recognizes tables in the document and converts them to
LaTeX
- Automatically detects and enables OCR for corrupted PDFs
- Supports both CPU and GPU environments
- Supports Windows, Linux, and Mac platforms
User Guide
-------------
.. toctree::
:maxdepth: 2
:caption: User Guide
user_guide
API Reference API Reference
------------- -------------
...@@ -34,5 +78,27 @@ method, this part of the documentation is for you. ...@@ -34,5 +78,27 @@ method, this part of the documentation is for you.
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
:caption: API
api api
Additional Notes
------------------
.. toctree::
:maxdepth: 1
:caption: Additional Notes
additional_notes/known_issues
additional_notes/faq
additional_notes/changelog
additional_notes/glossary
Projects
---------
.. toctree::
:maxdepth: 1
:caption: Projects
projects
\ No newline at end of file
llama_index_rag
===============
gradio_app
============
other projects
===============
\ No newline at end of file
.. toctree::
:maxdepth: 2
user_guide/install
user_guide/quick_start
user_guide/tutorial
user_guide/data
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment