feat: add [figure | table] match [caption | footnote] match algorithm v2

feat: add Data api

feat: add [figure | table] match [caption | footnote] match algorithm v2
feat: add Data api
283b597a · icecraft · e36627be · 283b597a · 283b597a · 283b597a
Commit 283b597a authored Oct 19, 2024 by icecraft
20 changed files
--- a/.gitignore
+++ b/.gitignore
 *.tar
 *.tar.gz
 *.zip
 venv*/
 envs/
 slurm_logs/
 sync1.sh
 data_preprocess_pj1
 data-preparation1
 __pycache__
 *.log
 *.pyc
 .vscode
 debug/
 *.ipynb
 .idea
 # vscode history
 .history
 .DS_Store
 .env
 bad_words/
 bak/
 app/tests/*
 temp/
 tmp/
 tmp
 .vscode
 .vscode/
 ocr_demo
 .coveragerc
 /app/common/__init__.py
 /magic_pdf/config/__init__.py
 source.dev.env
 tmp
 projects/web/node_modules
 projects/web/dist
 projects/web_demo/web_demo/static/
+cli_debug/
+debug_utils/
+# sphinx docs
+_build/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
    rev: 5.0.4
    hooks:
      - id: flake8
-        args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
+        args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:
@@ -12,11 +12,12 @@ repos:
    rev: v0.32.0
    hooks:
      - id: yapf
-        args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
+        args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.1
    hooks:
      - id: codespell
+        args: ['--skip', '*.json']
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:

--- a/docs/en/api.rst
+++ b/docs/en/api.rst
+Data Api
+------------------
+.. toctree::
+   :maxdepth: 2
+   api/dataset.rst
+   api/data_reader_writer.rst
+   api/read_api.rst
--- a/docs/en/api/data_reader_writer.rst
+++ b/docs/en/api/data_reader_writer.rst
+Data Reader Writer
+--------------------
+.. autoclass:: magic_pdf.data.data_reader_writer.DataReader
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
+   :members:
+   :inherited-members:
--- a/docs/en/api/dataset.rst
+++ b/docs/en/api/dataset.rst
+Dataset Api
+------------------
+.. autoclass:: magic_pdf.data.dataset.PageableData
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.dataset.Dataset
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.dataset.ImageDataset
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.dataset.PymuDocDataset
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.dataset.Image
+   :members:
+   :inherited-members:
+.. autoclass:: magic_pdf.data.dataset.Doc
+   :members:
+   :inherited-members:
--- a/docs/en/api/io.rst
+++ b/docs/en/api/io.rst
--- a/docs/en/api/read_api.rst
+++ b/docs/en/api/read_api.rst
+read_api Api
+------------------
+.. automodule:: magic_pdf.data.read_api
+   :members:
+   :inherited-members:
--- a/docs/en/api/schemas.rst
+++ b/docs/en/api/schemas.rst
--- a/docs/en/api/utils.rst
+++ b/docs/en/api/utils.rst
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -24,3 +24,15 @@ Welcome to the MinerU Documentation
   <a class="github-button" href="https://github.com/opendatalab/MinerU/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
   <a class="github-button" href="https://github.com/opendatalab/MinerU/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
   </p>
+API Reference
+-------------
+If you are looking for information on a specific function, class or
+method, this part of the documentation is for you.
+.. toctree::
+   :maxdepth: 2
+   api
--- a/magic_pdf/config/enums.py
+++ b/magic_pdf/config/enums.py
+import enum
+class SupportedPdfParseMethod(enum.Enum):
+    """Parse methods a dataset can declare support for."""
+    OCR = 'ocr'
+    TXT = 'txt'
--- a/magic_pdf/config/exceptions.py
+++ b/magic_pdf/config/exceptions.py
+class FileNotExisted(Exception):
+    """Error for a file that does not exist at the given path.
+    Attributes:
+        path: the path that was reported as missing.
+    """
+    def __init__(self, path):
+        # Forward the argument to Exception so ``args`` is populated; with empty
+        # ``args`` the instance cannot be rebuilt by pickle or copy.
+        super().__init__(path)
+        self.path = path
+    def __str__(self):
+        return f'File {self.path} does not exist.'
+class InvalidConfig(Exception):
+    """Error for a configuration that is rejected.
+    Attributes:
+        msg: description of what is wrong with the configuration.
+    """
+    def __init__(self, msg):
+        # Forward the argument to Exception so ``args`` is populated; with empty
+        # ``args`` the instance cannot be rebuilt by pickle or copy.
+        super().__init__(msg)
+        self.msg = msg
+    def __str__(self):
+        return f'Invalid config: {self.msg}'
+class InvalidParams(Exception):
+    """Error for call parameters that are rejected.
+    Attributes:
+        msg: description of what is wrong with the parameters.
+    """
+    def __init__(self, msg):
+        # Forward the argument to Exception so ``args`` is populated; with empty
+        # ``args`` the instance cannot be rebuilt by pickle or copy.
+        super().__init__(msg)
+        self.msg = msg
+    def __str__(self):
+        return f'Invalid params: {self.msg}'
+class EmptyData(Exception):
+    """Error for data that turned out to be empty.
+    Attributes:
+        msg: description of which data was empty.
+    """
+    def __init__(self, msg):
+        # Forward the argument to Exception so ``args`` is populated; with empty
+        # ``args`` the instance cannot be rebuilt by pickle or copy.
+        super().__init__(msg)
+        self.msg = msg
+    def __str__(self):
+        return f'Empty data: {self.msg}'
--- a/magic_pdf/data/__init__.py
+++ b/magic_pdf/data/__init__.py
--- a/magic_pdf/data/data_reader_writer/__init__.py
+++ b/magic_pdf/data/data_reader_writer/__init__.py
+from magic_pdf.data.data_reader_writer.filebase import \
+    FileBasedDataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.filebase import \
+    FileBasedDataWriter  # noqa: F401
+from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
+    MultiBucketS3DataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \
+    MultiBucketS3DataWriter  # noqa: F401
+from magic_pdf.data.data_reader_writer.s3 import S3DataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.s3 import S3DataWriter  # noqa: F401
+from magic_pdf.data.data_reader_writer.base import DataReader  # noqa: F401
+from magic_pdf.data.data_reader_writer.base import DataWriter  # noqa: F401
\ No newline at end of file
--- a/magic_pdf/data/data_reader_writer/base.py
+++ b/magic_pdf/data/data_reader_writer/base.py
+from abc import ABC, abstractmethod
+class DataReader(ABC):
+    """Abstract reader that returns the bytes stored at a path."""
+    def read(self, path: str) -> bytes:
+        """Fetch the complete content of a file.
+        Args:
+            path (str): location of the file to read
+        Returns:
+            bytes: whatever ``read_at`` yields with its default offset and limit
+        """
+        whole_file = self.read_at(path)
+        return whole_file
+    @abstractmethod
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Fetch a slice of a file; concrete readers must implement this.
+        Args:
+            path (str): location of the file to read
+            offset (int, optional): count of leading bytes to skip. Defaults to 0.
+            limit (int, optional): count of bytes to return. Defaults to -1.
+        Returns:
+            bytes: the requested content
+        """
+class DataWriter(ABC):
+    """Abstract writer that stores bytes at a path."""
+    @abstractmethod
+    def write(self, path: str, data: bytes) -> None:
+        """Store raw bytes; concrete writers must implement this.
+        Args:
+            path (str): destination of the data
+            data (bytes): the content to store
+        """
+    def write_string(self, path: str, data: str) -> None:
+        """Store text by encoding it and handing the bytes to ``write``.
+        Args:
+            path (str): destination of the data
+            data (str): the text to store, encoded with ``str.encode()`` defaults
+        """
+        encoded = data.encode()
+        self.write(path, encoded)
--- a/magic_pdf/data/data_reader_writer/filebase.py
+++ b/magic_pdf/data/data_reader_writer/filebase.py
+import os
+from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
+class FileBasedDataReader(DataReader):
+    """Reader for files on the local file system."""
+    def __init__(self, parent_dir: str = ''):
+        """Remember the directory that relative paths are resolved against.
+        Args:
+            parent_dir (str, optional): base directory for relative paths. Defaults to ''.
+        """
+        self._parent_dir = parent_dir
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Read a byte range of a local file.
+        Args:
+            path (str): file to read; a relative path is joined with parent_dir when one was given.
+            offset (int, optional): count of leading bytes to skip. Defaults to 0.
+            limit (int, optional): count of bytes to read, -1 reads up to the end. Defaults to -1.
+        Returns:
+            bytes: the bytes that were read
+        """
+        needs_join = not os.path.isabs(path) and len(self._parent_dir) > 0
+        target = os.path.join(self._parent_dir, path) if needs_join else path
+        with open(target, 'rb') as fh:
+            fh.seek(offset)
+            return fh.read() if limit == -1 else fh.read(limit)
+class FileBasedDataWriter(DataWriter):
+    """Writer for files on the local file system."""
+    def __init__(self, parent_dir: str = '') -> None:
+        """Initialized with parent_dir.
+        Args:
+            parent_dir (str, optional): the directory that relative paths are joined with. Defaults to ''.
+        """
+        self._parent_dir = parent_dir
+    def write(self, path: str, data: bytes) -> None:
+        """Write file with data, creating missing parent directories first.
+        Args:
+            path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
+            data (bytes): the data want to write
+        """
+        fn_path = path
+        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
+            fn_path = os.path.join(self._parent_dir, path)
+        # open() does not create directories; make sure the target directory
+        # exists so writing into a fresh output tree does not fail.
+        target_dir = os.path.dirname(fn_path)
+        if target_dir:
+            os.makedirs(target_dir, exist_ok=True)
+        with open(fn_path, 'wb') as f:
+            f.write(data)
--- a/magic_pdf/data/data_reader_writer/multi_bucket_s3.py
+++ b/magic_pdf/data/data_reader_writer/multi_bucket_s3.py
+from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
+from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
+from magic_pdf.data.io.s3 import S3Reader, S3Writer
+from magic_pdf.data.schemas import S3Config
+from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
+                                       remove_non_official_s3_args)
+class MultiS3Mixin:
+    """Shared constructor for clients that address several s3 buckets."""
+    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+        """Initialized with multiple s3 configs.
+        Args:
+            default_bucket (str): the default bucket name of the relative path
+            s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
+        Raises:
+            InvalidConfig: default bucket must be provided
+            InvalidConfig: default bucket config not in s3_configs
+            InvalidConfig: bucket name not unique in s3_configs
+        """
+        # Truthiness also rejects None, which ``len()`` would turn into a TypeError.
+        if not default_bucket:
+            raise InvalidConfig('default_bucket must be provided')
+        # Error messages list bucket names only: the config objects also carry
+        # access/secret keys, which should not end up in logs or tracebacks.
+        bucket_names = [conf.bucket_name for conf in s3_configs]
+        if default_bucket not in bucket_names:
+            raise InvalidConfig(
+                f'default_bucket: {default_bucket} config must be provided in s3_configs, configured buckets: {bucket_names}'
+            )
+        if len(set(bucket_names)) != len(bucket_names):
+            raise InvalidConfig(
+                f'the bucket_name in s3_configs must be unique, configured buckets: {bucket_names}'
+            )
+        self.default_bucket = default_bucket
+        self.s3_configs = s3_configs
+        # bucket name -> client, filled lazily by the reader/writer subclasses
+        self._s3_clients_h: dict = {}
+class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
+    """Reader that selects an s3 client per request from the bucket named in the path."""
+    def read(self, path: str) -> bytes:
+        """Read the path from s3, select a different bucket client for each request
+        based on the path, also support range read.
+        Args:
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
+            for example: s3://bucket_name/path?0,100
+        Returns:
+            bytes: the content of s3 file
+        """
+        may_range_params = parse_s3_range_params(path)
+        if may_range_params is None or len(may_range_params) != 2:
+            # no (offset, limit) pair was parsed from the path: read everything
+            byte_start, byte_len = 0, -1
+        else:
+            byte_start, byte_len = int(may_range_params[0]), int(may_range_params[1])
+        path = remove_non_official_s3_args(path)
+        return self.read_at(path, byte_start, byte_len)
+    def __get_s3_client(self, bucket_name: str):
+        """Return the reader client of ``bucket_name``, creating and caching it on first use.
+        Args:
+            bucket_name (str): bucket whose client is wanted
+        Raises:
+            InvalidParams: no config with that bucket name exists in s3_configs
+        """
+        conf = next((c for c in self.s3_configs if c.bucket_name == bucket_name), None)
+        if conf is None:
+            # Report bucket names only; the configs themselves hold access/secret keys.
+            known_buckets = [c.bucket_name for c in self.s3_configs]
+            raise InvalidParams(
+                f'bucket name: {bucket_name} not found in s3_configs, configured buckets: {known_buckets}'
+            )
+        if bucket_name not in self._s3_clients_h:
+            self._s3_clients_h[bucket_name] = S3Reader(
+                bucket_name,
+                conf.access_key,
+                conf.secret_key,
+                conf.endpoint_url,
+                conf.addressing_style,
+            )
+        return self._s3_clients_h[bucket_name]
+    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
+        """Read the file with offset and limit, select a different bucket client
+        for each request based on the path.
+        Args:
+            path (str): the file path; a path without the ``s3://`` prefix is read from the default bucket
+            offset (int, optional): the number of bytes skipped. Defaults to 0.
+            limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
+        Returns:
+            bytes: the file content
+        """
+        if path.startswith('s3://'):
+            bucket_name, path = parse_s3path(path)
+            s3_reader = self.__get_s3_client(bucket_name)
+        else:
+            s3_reader = self.__get_s3_client(self.default_bucket)
+        return s3_reader.read_at(path, offset, limit)
+class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
+    """Writer that selects an s3 client per request from the bucket named in the path."""
+    def __get_s3_client(self, bucket_name: str):
+        """Return the writer client of ``bucket_name``, creating and caching it on first use.
+        Args:
+            bucket_name (str): bucket whose client is wanted
+        Raises:
+            InvalidParams: no config with that bucket name exists in s3_configs
+        """
+        conf = next((c for c in self.s3_configs if c.bucket_name == bucket_name), None)
+        if conf is None:
+            # Report bucket names only; the configs themselves hold access/secret keys.
+            known_buckets = [c.bucket_name for c in self.s3_configs]
+            raise InvalidParams(
+                f'bucket name: {bucket_name} not found in s3_configs, configured buckets: {known_buckets}'
+            )
+        if bucket_name not in self._s3_clients_h:
+            self._s3_clients_h[bucket_name] = S3Writer(
+                bucket_name,
+                conf.access_key,
+                conf.secret_key,
+                conf.endpoint_url,
+                conf.addressing_style,
+            )
+        return self._s3_clients_h[bucket_name]
+    def write(self, path: str, data: bytes) -> None:
+        """Write file with data, also select a different bucket client for each
+        request based on the path.
+        Args:
+            path (str): the path of file; a path without the ``s3://`` prefix is written to the default bucket.
+            data (bytes): the data want to write
+        """
+        if path.startswith('s3://'):
+            bucket_name, path = parse_s3path(path)
+            s3_writer = self.__get_s3_client(bucket_name)
+        else:
+            s3_writer = self.__get_s3_client(self.default_bucket)
+        return s3_writer.write(path, data)
--- a/magic_pdf/data/data_reader_writer/s3.py
+++ b/magic_pdf/data/data_reader_writer/s3.py
+from magic_pdf.data.data_reader_writer.multi_bucket_s3 import (
+    MultiBucketS3DataReader, MultiBucketS3DataWriter)
+from magic_pdf.data.schemas import S3Config
+class S3DataReader(MultiBucketS3DataReader):
+    """Single-bucket reader built on top of the multi-bucket reader."""
+    def __init__(
+        self,
+        bucket: str,
+        ak: str,
+        sk: str,
+        endpoint_url: str,
+        addressing_style: str = 'auto',
+    ):
+        """Create a reader bound to one bucket, which also serves as the default bucket.
+        Args:
+            bucket (str): name of the bucket
+            ak (str): access key
+            sk (str): secret key
+            endpoint_url (str): endpoint url of the s3 service
+            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
+            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
+        """
+        only_config = S3Config(
+            bucket_name=bucket,
+            access_key=ak,
+            secret_key=sk,
+            endpoint_url=endpoint_url,
+            addressing_style=addressing_style,
+        )
+        super().__init__(bucket, [only_config])
+class S3DataWriter(MultiBucketS3DataWriter):
+    """Single-bucket writer built on top of the multi-bucket writer."""
+    def __init__(
+        self,
+        bucket: str,
+        ak: str,
+        sk: str,
+        endpoint_url: str,
+        addressing_style: str = 'auto',
+    ):
+        """Create a writer bound to one bucket, which also serves as the default bucket.
+        Args:
+            bucket (str): name of the bucket
+            ak (str): access key
+            sk (str): secret key
+            endpoint_url (str): endpoint url of the s3 service
+            addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual'
+            refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
+        """
+        only_config = S3Config(
+            bucket_name=bucket,
+            access_key=ak,
+            secret_key=sk,
+            endpoint_url=endpoint_url,
+            addressing_style=addressing_style,
+        )
+        super().__init__(bucket, [only_config])
--- a/magic_pdf/data/dataset.py
+++ b/magic_pdf/data/dataset.py
+from abc import ABC, abstractmethod
+from typing import Iterator
+import fitz
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.schemas import PageInfo
+from magic_pdf.data.utils import fitz_doc_to_image
+class PageableData(ABC):
+    """Interface of a single page that can be rendered and inspected."""
+    @abstractmethod
+    def get_image(self) -> dict:
+        """Render this page as an image and return it as a dict."""
+    @abstractmethod
+    def get_doc(self) -> fitz.Page:
+        """Return the underlying ``fitz.Page`` object."""
+    @abstractmethod
+    def get_page_info(self) -> PageInfo:
+        """Describe this page.
+        Returns:
+            PageInfo: the info record of this page
+        """
+class Dataset(ABC):
+    """Interface of a collection of pages built from raw bytes."""
+    @abstractmethod
+    def __len__(self) -> int:
+        """Return how many pages the dataset holds."""
+    @abstractmethod
+    def __iter__(self) -> Iterator[PageableData]:
+        """Iterate over the pages of the dataset."""
+    @abstractmethod
+    def supported_methods(self) -> list[SupportedPdfParseMethod]:
+        """List the parse methods usable with this dataset.
+        Returns:
+            list[SupportedPdfParseMethod]: the supported methods, valid members are: OCR, TXT
+        """
+    @abstractmethod
+    def data_bits(self) -> bytes:
+        """Return the bytes this dataset was created from."""
+    @abstractmethod
+    def get_page(self, page_id: int) -> PageableData:
+        """Look up a single page.
+        Args:
+            page_id (int): index of the wanted page
+        Returns:
+            PageableData: the page stored at that index
+        """
+class PymuDocDataset(Dataset):
+    """Dataset backed by a pdf opened with PyMuPDF (fitz)."""
+    def __init__(self, bits: bytes):
+        """Initialize the dataset, which wraps the pages of the opened pdf.
+        Args:
+            bits (bytes): the bytes of the pdf
+        """
+        # Keep the opened document referenced for the dataset's lifetime so the
+        # wrapped page objects never outlive their parent document.
+        self._raw_fitz = fitz.open('pdf', bits)
+        self._records = [Doc(v) for v in self._raw_fitz]
+        self._data_bits = bits
+        self._raw_data = bits
+    def __len__(self) -> int:
+        """The page number of the pdf."""
+        return len(self._records)
+    def __iter__(self) -> Iterator[PageableData]:
+        """Yield the page doc object."""
+        return iter(self._records)
+    def supported_methods(self) -> list[SupportedPdfParseMethod]:
+        """The method supported by this dataset.
+        Returns:
+            list[SupportedPdfParseMethod]: the supported methods, OCR and TXT
+        """
+        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]
+    def data_bits(self) -> bytes:
+        """The pdf bits used to create this dataset."""
+        return self._data_bits
+    def get_page(self, page_id: int) -> PageableData:
+        """The page doc object.
+        Args:
+            page_id (int): the page doc index
+        Returns:
+            PageableData: the page doc object
+        """
+        return self._records[page_id]
+class ImageDataset(Dataset):
+    """Dataset backed by an image that is converted to a pdf with PyMuPDF (fitz)."""
+    def __init__(self, bits: bytes):
+        """Initialize the dataset from image bytes.
+        Args:
+            bits (bytes): the bytes of the photo which will be converted to pdf first, then opened as a pdf document.
+        """
+        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+        # Keep the opened document referenced for the dataset's lifetime so the
+        # wrapped page objects never outlive their parent document.
+        self._raw_fitz = fitz.open('pdf', pdf_bytes)
+        self._records = [Doc(v) for v in self._raw_fitz]
+        # _raw_data holds the original image bytes, _data_bits the converted pdf
+        self._raw_data = bits
+        self._data_bits = pdf_bytes
+    def __len__(self) -> int:
+        """The length of the dataset."""
+        return len(self._records)
+    def __iter__(self) -> Iterator[PageableData]:
+        """Yield the page object."""
+        return iter(self._records)
+    def supported_methods(self) -> list[SupportedPdfParseMethod]:
+        """The method supported by this dataset.
+        Returns:
+            list[SupportedPdfParseMethod]: the supported methods, OCR only
+        """
+        return [SupportedPdfParseMethod.OCR]
+    def data_bits(self) -> bytes:
+        """The pdf bits (converted from the image) used to create this dataset."""
+        return self._data_bits
+    def get_page(self, page_id: int) -> PageableData:
+        """The page doc object.
+        Args:
+            page_id (int): the page doc index
+        Returns:
+            PageableData: the page doc object
+        """
+        return self._records[page_id]
+class Doc(PageableData):
+    """Page wrapper initialized with a ``fitz.Page`` object.
+    Attributes that this wrapper does not define are looked up on the wrapped page.
+    """
+    def __init__(self, doc: fitz.Page):
+        self._doc = doc
+    def get_image(self):
+        """Return the image info.
+        Returns:
+            dict: {
+                img: np.ndarray,
+                width: int,
+                height: int
+            }
+        """
+        return fitz_doc_to_image(self._doc)
+    def get_doc(self) -> fitz.Page:
+        """Get the wrapped page object.
+        Returns:
+            fitz.Page: the wrapped page object
+        """
+        return self._doc
+    def get_page_info(self) -> PageInfo:
+        """Get the page info of the page.
+        Returns:
+            PageInfo: the width and height of the page rectangle
+        """
+        page_w = self._doc.rect.width
+        page_h = self._doc.rect.height
+        return PageInfo(w=page_w, h=page_h)
+    def __getattr__(self, name):
+        """Delegate attributes not found on this wrapper to the wrapped page.
+        Raises:
+            AttributeError: the wrapped page has no such attribute either
+        """
+        # ``_doc`` is missing only when ``__init__`` has not run (e.g. during
+        # copy or unpickle); looking it up here would recurse without end.
+        if name == '_doc':
+            raise AttributeError(name)
+        # getattr raises AttributeError for unknown names instead of silently
+        # yielding None, so hasattr() on a Doc gives truthful answers.
+        return getattr(self._doc, name)
--- a/magic_pdf/data/io/__init__.py
+++ b/magic_pdf/data/io/__init__.py