Unverified Commit 8b119e22 authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #833 from icecraft/feat/tune_docs

Feat/tune docs
parents 099f19f2 065bf993
@@ -10,7 +10,7 @@ formats:
 python:
   install:
-    - requirements: docs/zh_cn/requirements.txt
+    - requirements: next_docs/zh_cn/requirements.txt
 sphinx:
-  configuration: docs/zh_cn/conf.py
+  configuration: next_docs/zh_cn/conf.py
+import os
+
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
 class MultiS3Mixin:
-    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+    def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
         """Initialized with multiple s3 configs.

         Args:
-            default_bucket (str): the default bucket name of the relative path
+            default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
             s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.

         Raises:
-            InvalidConfig: default bucket config not in s3_configs
-            InvalidConfig: bucket name not unique in s3_configs
-            InvalidConfig: default bucket must be provided
+            InvalidConfig: default bucket config not in s3_configs.
+            InvalidConfig: bucket name not unique in s3_configs.
+            InvalidConfig: default bucket must be provided.
         """
-        if len(default_bucket) == 0:
-            raise InvalidConfig('default_bucket must be provided')
+        if len(default_prefix) == 0:
+            raise InvalidConfig('default_prefix must be provided')
+
+        arr = default_prefix.strip("/").split("/")
+        self.default_bucket = arr[0]
+        self.default_prefix = "/".join(arr[1:])

         found_default_bucket_config = False
         for conf in s3_configs:
-            if conf.bucket_name == default_bucket:
+            if conf.bucket_name == self.default_bucket:
                 found_default_bucket_config = True
                 break

         if not found_default_bucket_config:
             raise InvalidConfig(
-                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
+                f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
             )

         uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
                 f'the bucket_name in s3_configs: {s3_configs} must be unique'
             )

-        self.default_bucket = default_bucket
         self.s3_configs = s3_configs
         self._s3_clients_h: dict = {}
@@ -47,14 +51,14 @@ class MultiS3Mixin:
 class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read(self, path: str) -> bytes:
         """Read the path from s3, select diffect bucket client for each request
-        based on the path, also support range read.
+        based on the bucket, also support range read.

         Args:
-            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
-                for example: s3://bucket_name/path?0,100
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
+                for example: s3://bucket_name/path?0,100.

         Returns:
-            bytes: the content of s3 file
+            bytes: the content of s3 file.
         """
         may_range_params = parse_s3_range_params(path)
         if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
         """Read the file with offset and limit, select diffect bucket client
-        for each request based on the path.
+        for each request based on the bucket.

         Args:
-            path (str): the file path
+            path (str): the file path.
             offset (int, optional): the number of bytes skipped. Defaults to 0.
             limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.

         Returns:
-            bytes: the file content
+            bytes: the file content.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_reader = self.__get_s3_client(bucket_name)
         else:
             s3_reader = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_reader.read_at(path, offset, limit)
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
     def write(self, path: str, data: bytes) -> None:
         """Write file with data, also select diffect bucket client for each
-        request based on the path.
+        request based on the bucket.

         Args:
             path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
-            data (bytes): the data want to write
+            data (bytes): the data want to write.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_writer = self.__get_s3_client(bucket_name)
         else:
             s3_writer = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_writer.write(path, data)
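For context, a minimal usage sketch of the reader under the new ``default_prefix`` semantics. The import path, bucket names, credentials, and the ``endpoint_url`` field are illustrative assumptions, not taken from this diff:

.. code:: python

   from magic_pdf.data.data_reader_writer.multi_bucket_s3 import MultiBucketS3DataReader  # assumed module path
   from magic_pdf.data.schemas import S3Config

   # Hypothetical credentials and names, for illustration only.
   configs = [
       S3Config(
           bucket_name='some-bucket',
           access_key='ak',
           secret_key='sk',
           endpoint_url='https://s3.example.com',  # assumed S3Config field, not shown in this diff
       ),
   ]

   # default_prefix now carries the bucket plus an optional prefix; relative
   # paths passed to read()/read_at() are resolved under that prefix.
   reader = MultiBucketS3DataReader('some-bucket/some/prefix', configs)

   head = reader.read_at('file.pdf', offset=0, limit=1024)  # -> s3://some-bucket/some/prefix/file.pdf
   full = reader.read('s3://some-bucket/other/file.pdf')    # absolute s3 paths bypass the default prefix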
@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
 class S3DataReader(MultiBucketS3DataReader):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
         """s3 reader client.

         Args:
+            default_prefix_without_bucket: prefix that not contains bucket
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
                 refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
             [
                 S3Config(
                     bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
 class S3DataWriter(MultiBucketS3DataWriter):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
         """s3 writer client.

         Args:
+            default_prefix_without_bucket: prefix that not contains bucket
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
                 refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
             [
                 S3Config(
                     bucket_name=bucket,
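Correspondingly, callers of the single-bucket helpers now pass the prefix as the first argument. A sketch under the assumption that the remaining parameters (endpoint URL and so on) are unchanged; all values below are placeholders:

.. code:: python

   from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter  # assumed module path

   # Placeholder values; note the new leading default_prefix_without_bucket argument.
   writer = S3DataWriter('unittest/tmp', 'some-bucket', 'ak', 'sk', 'https://s3.example.com')
   reader = S3DataReader('unittest/tmp', 'some-bucket', 'ak', 'sk', 'https://s3.example.com')

   writer.write('output/result.md', b'# hello')  # -> s3://some-bucket/unittest/tmp/output/result.md
   data = reader.read('output/result.md')        # read back through the same default prefix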
+from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
+from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
+from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401
+
+__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
@@ -29,7 +29,7 @@ class IOReader(ABC):
         pass


-class IOWriter:
+class IOWriter(ABC):

     @abstractmethod
     def write(self, path: str, data: bytes) -> None:
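One note on the ``IOWriter`` change above: without inheriting from ``ABC``, the ``@abstractmethod`` decorator is not enforced when a subclass is instantiated. A minimal sketch of the effect (``BrokenWriter`` is a hypothetical subclass, not part of the codebase):

.. code:: python

   from abc import ABC, abstractmethod

   class IOWriter(ABC):
       @abstractmethod
       def write(self, path: str, data: bytes) -> None:
           pass

   class BrokenWriter(IOWriter):
       pass  # forgot to implement write()

   try:
       BrokenWriter()
   except TypeError as e:
       print(e)  # abstract methods are now enforced at instantiation time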
@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
 class S3Config(BaseModel):
+    """S3 config
+    """
     bucket_name: str = Field(description='s3 bucket name', min_length=1)
     access_key: str = Field(description='s3 access key', min_length=1)
     secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
 class PageInfo(BaseModel):
+    """The width and height of page
+    """
     w: float = Field(description='the width of page')
     h: float = Field(description='the height of page')
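For orientation, these pydantic models validate their fields on construction; a tiny sketch with placeholder values (``endpoint_url`` is assumed to be one of the remaining ``S3Config`` fields not shown in this hunk):

.. code:: python

   from magic_pdf.data.schemas import PageInfo, S3Config

   cfg = S3Config(
       bucket_name='some-bucket',
       access_key='ak',
       secret_key='sk',
       endpoint_url='https://s3.example.com',  # assumed field, elided from this hunk
   )
   page = PageInfo(w=612.0, h=792.0)  # e.g. a US Letter page in PDF points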
@@ -10,7 +10,7 @@ formats:
 python:
   install:
-    - requirements: docs/requirements.txt
+    - requirements: next_docs/requirements.txt
 sphinx:
-  configuration: docs/en/conf.py
+  configuration: next_docs/en/conf.py
Changelog
=========

- 2024/09/27: Version 0.8.1 released, fixed some bugs, and provided a
  `localized deployment version <projects/web_demo/README.md>`__ of the
  `online demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
  the `front-end interface <projects/web/README.md>`__.
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with
  Dockerfile, and launching demos on Huggingface and Modelscope.
- 2024/08/30: Version 0.7.1 released, added the paddle tablemaster table
  recognition option.
- 2024/08/09: Version 0.7.0b1 released, simplified the installation
  process and added table recognition functionality.
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
  issues and installation documentation.
- 2024/07/05: Initial open-source release.

.. warning::

   fix ``localized deployment version`` and ``front-end interface``
FAQ
==========================

1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

On macOS, the default shell has switched from Bash to Z shell, which has
special handling logic for certain types of string matching. This can
lead to the “no matches found” error. You can try disabling the globbing
feature in the command line and then running the installation command again.

.. code:: bash

   setopt no_nomatch
   pip install magic-pdf[full]
2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This might be due to an incomplete download of the model files. You can
try re-downloading them and running again. Reference:
https://github.com/opendatalab/MinerU/issues/143
3. Where should the model files be downloaded and how should the ``models-dir`` configuration be set?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The path for the model files is configured in “magic-pdf.json”, for example:

.. code:: json

   {
     "models-dir": "/tmp/models"
   }

This path must be an absolute path, not a relative path. You can obtain
the absolute path by running the “pwd” command in the models directory.
Reference:
https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
install the ``libgl`` library with the following command to resolve the
issue:

.. code:: bash

   sudo apt-get install libgl1-mesa-glx

Reference: https://github.com/opendatalab/MinerU/issues/388
5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

You need to uninstall the module and reinstall it:

.. code:: bash

   pip uninstall fairscale
   pip install fairscale

Reference: https://github.com/opendatalab/MinerU/issues/411
6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

CUDA 11 has poor compatibility with newer graphics cards, so the CUDA
version used by Paddle needs to be upgraded:

.. code:: bash

   pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/

Reference: https://github.com/opendatalab/MinerU/issues/558