Unverified Commit 8b119e22 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #833 from icecraft/feat/tune_docs

Feat/tune docs
parents 099f19f2 065bf993
@@ -10,7 +10,7 @@ formats:
 python:
   install:
-    - requirements: docs/zh_cn/requirements.txt
+    - requirements: next_docs/zh_cn/requirements.txt
 sphinx:
-  configuration: docs/zh_cn/conf.py
+  configuration: next_docs/zh_cn/conf.py
+import os
+
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
 class MultiS3Mixin:
-    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+    def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
         """Initialized with multiple s3 configs.

         Args:
-            default_bucket (str): the default bucket name of the relative path
+            default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
             s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.

         Raises:
-            InvalidConfig: default bucket config not in s3_configs
-            InvalidConfig: bucket name not unique in s3_configs
-            InvalidConfig: default bucket must be provided
+            InvalidConfig: default bucket config not in s3_configs.
+            InvalidConfig: bucket name not unique in s3_configs.
+            InvalidConfig: default bucket must be provided.
         """
-        if len(default_bucket) == 0:
-            raise InvalidConfig('default_bucket must be provided')
+        if len(default_prefix) == 0:
+            raise InvalidConfig('default_prefix must be provided')
+
+        arr = default_prefix.strip("/").split("/")
+        self.default_bucket = arr[0]
+        self.default_prefix = "/".join(arr[1:])

         found_default_bucket_config = False
         for conf in s3_configs:
-            if conf.bucket_name == default_bucket:
+            if conf.bucket_name == self.default_bucket:
                 found_default_bucket_config = True
                 break

         if not found_default_bucket_config:
             raise InvalidConfig(
-                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
+                f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
             )

         uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
                 f'the bucket_name in s3_configs: {s3_configs} must be unique'
             )

-        self.default_bucket = default_bucket
         self.s3_configs = s3_configs
         self._s3_clients_h: dict = {}
@@ -47,14 +51,14 @@ class MultiS3Mixin:
 class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read(self, path: str) -> bytes:
         """Read the path from s3, select diffect bucket client for each request
-        based on the path, also support range read.
+        based on the bucket, also support range read.

         Args:
-            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
-                for example: s3://bucket_name/path?0,100
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
+                for example: s3://bucket_name/path?0,100.

         Returns:
-            bytes: the content of s3 file
+            bytes: the content of s3 file.
         """
         may_range_params = parse_s3_range_params(path)
         if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
         """Read the file with offset and limit, select diffect bucket client
-        for each request based on the path.
+        for each request based on the bucket.

         Args:
-            path (str): the file path
+            path (str): the file path.
             offset (int, optional): the number of bytes skipped. Defaults to 0.
             limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.

         Returns:
-            bytes: the file content
+            bytes: the file content.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_reader = self.__get_s3_client(bucket_name)
         else:
             s3_reader = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_reader.read_at(path, offset, limit)
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
     def write(self, path: str, data: bytes) -> None:
         """Write file with data, also select diffect bucket client for each
-        request based on the path.
+        request based on the bucket.

         Args:
             path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
-            data (bytes): the data want to write
+            data (bytes): the data want to write.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_writer = self.__get_s3_client(bucket_name)
         else:
             s3_writer = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_writer.write(path, data)
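For context, a minimal usage sketch of the reader under the new ``default_prefix`` semantics. The import path, bucket names, credentials, and the ``endpoint_url`` field are illustrative assumptions, not taken from this diff:

.. code:: python

   from magic_pdf.data.data_reader_writer.multi_bucket_s3 import MultiBucketS3DataReader  # assumed module path
   from magic_pdf.data.schemas import S3Config

   # Hypothetical credentials and names, for illustration only.
   configs = [
       S3Config(
           bucket_name='some-bucket',
           access_key='ak',
           secret_key='sk',
           endpoint_url='https://s3.example.com',  # assumed S3Config field, not shown in this diff
       ),
   ]

   # default_prefix now carries the bucket plus an optional prefix; relative
   # paths passed to read()/read_at() are resolved under that prefix.
   reader = MultiBucketS3DataReader('some-bucket/some/prefix', configs)

   head = reader.read_at('file.pdf', offset=0, limit=1024)  # -> s3://some-bucket/some/prefix/file.pdf
   full = reader.read('s3://some-bucket/other/file.pdf')    # absolute s3 paths bypass the default prefix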
@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
 class S3DataReader(MultiBucketS3DataReader):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
         """s3 reader client.

         Args:
+            default_prefix_without_bucket: prefix that not contains bucket
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
                 refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
             [
                 S3Config(
                     bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
 class S3DataWriter(MultiBucketS3DataWriter):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
         """s3 writer client.

         Args:
+            default_prefix_without_bucket: prefix that not contains bucket
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
                 refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
             [
                 S3Config(
                     bucket_name=bucket,
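Correspondingly, callers of the single-bucket helpers now pass the prefix as the first argument. A sketch under the assumption that the remaining parameters (endpoint URL and so on) are unchanged; all values below are placeholders:

.. code:: python

   from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter  # assumed module path

   # Placeholder values; note the new leading default_prefix_without_bucket argument.
   writer = S3DataWriter('unittest/tmp', 'some-bucket', 'ak', 'sk', 'https://s3.example.com')
   reader = S3DataReader('unittest/tmp', 'some-bucket', 'ak', 'sk', 'https://s3.example.com')

   writer.write('output/result.md', b'# hello')  # -> s3://some-bucket/unittest/tmp/output/result.md
   data = reader.read('output/result.md')        # read back through the same default prefix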
+from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
+from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
+from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401
+
+__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
@@ -29,7 +29,7 @@ class IOReader(ABC):
         pass


-class IOWriter:
+class IOWriter(ABC):

     @abstractmethod
     def write(self, path: str, data: bytes) -> None:
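One note on the ``IOWriter`` change above: without inheriting from ``ABC``, the ``@abstractmethod`` decorator is not enforced when a subclass is instantiated. A minimal sketch of the effect (``BrokenWriter`` is a hypothetical subclass, not part of the codebase):

.. code:: python

   from abc import ABC, abstractmethod

   class IOWriter(ABC):
       @abstractmethod
       def write(self, path: str, data: bytes) -> None:
           pass

   class BrokenWriter(IOWriter):
       pass  # forgot to implement write()

   try:
       BrokenWriter()
   except TypeError as e:
       print(e)  # abstract methods are now enforced at instantiation time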
@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
 class S3Config(BaseModel):
+    """S3 config
+    """
     bucket_name: str = Field(description='s3 bucket name', min_length=1)
     access_key: str = Field(description='s3 access key', min_length=1)
     secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
 class PageInfo(BaseModel):
+    """The width and height of page
+    """
     w: float = Field(description='the width of page')
     h: float = Field(description='the height of page')
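For orientation, these pydantic models validate their fields on construction; a tiny sketch with placeholder values (``endpoint_url`` is assumed to be one of the remaining ``S3Config`` fields not shown in this hunk):

.. code:: python

   from magic_pdf.data.schemas import PageInfo, S3Config

   cfg = S3Config(
       bucket_name='some-bucket',
       access_key='ak',
       secret_key='sk',
       endpoint_url='https://s3.example.com',  # assumed field, elided from this hunk
   )
   page = PageInfo(w=612.0, h=792.0)  # e.g. a US Letter page in PDF points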
@@ -10,7 +10,7 @@ formats:
 python:
   install:
-    - requirements: docs/requirements.txt
+    - requirements: next_docs/requirements.txt
 sphinx:
-  configuration: docs/en/conf.py
+  configuration: next_docs/en/conf.py
Changelog
=========

- 2024/09/27: Version 0.8.1 released, fixed some bugs, and provided a
  `localized deployment version <projects/web_demo/README.md>`__ of the
  `online demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
  the `front-end interface <projects/web/README.md>`__.
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with
  Dockerfile, and launching demos on Huggingface and Modelscope.
- 2024/08/30: Version 0.7.1 released, added the paddle tablemaster table
  recognition option.
- 2024/08/09: Version 0.7.0b1 released, simplified the installation
  process and added table recognition functionality.
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
  issues and installation documentation.
- 2024/07/05: Initial open-source release.

.. warning::

   fix ``localized deployment version`` and ``front-end interface``
FAQ
==========================

1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

On macOS, the default shell has switched from Bash to Z shell, which has
special handling logic for certain types of string matching. This can
lead to the “no matches found” error. You can try disabling the globbing
feature in the command line and then running the installation command again.

.. code:: bash

   setopt no_nomatch
   pip install magic-pdf[full]
2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This might be due to an incomplete download of the model files. You can
try re-downloading them and running again. Reference:
https://github.com/opendatalab/MinerU/issues/143
3. Where should the model files be downloaded and how should the ``models-dir`` configuration be set?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The path for the model files is configured in “magic-pdf.json”, for example:

.. code:: json

   {
     "models-dir": "/tmp/models"
   }

This path must be an absolute path, not a relative path. You can obtain
the absolute path by running the “pwd” command in the models directory.
Reference:
https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
install the ``libgl`` library with the following command to resolve the
issue:

.. code:: bash

   sudo apt-get install libgl1-mesa-glx

Reference: https://github.com/opendatalab/MinerU/issues/388
5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You need to uninstall the module and reinstall it:

.. code:: bash

   pip uninstall fairscale
   pip install fairscale

Reference: https://github.com/opendatalab/MinerU/issues/411
6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

CUDA 11 has poor compatibility with newer graphics cards, so the CUDA
version used by Paddle needs to be upgraded:

.. code:: bash

   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/

Reference: https://github.com/opendatalab/MinerU/issues/558