Commit 47db844c authored by xu rui's avatar xu rui
Browse files

feat: add more docs about data related api

parent 73afb7d6
IO
==
.. autoclass:: magic_pdf.data.io.base.IOReader
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.base.IOWriter
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.s3.S3Reader
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.s3.S3Writer
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.http.HttpReader
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.io.http.HttpWriter
:members:
:inherited-members:
:show-inheritance:
schemas
===========
.. autopydantic_model:: magic_pdf.data.schemas.S3Config
:members:
.. autopydantic_model:: magic_pdf.data.schemas.PageInfo
:members:
import os
from magic_pdf.config.exceptions import InvalidConfig, InvalidParams from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
from magic_pdf.data.io.s3 import S3Reader, S3Writer from magic_pdf.data.io.s3 import S3Reader, S3Writer
...@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path, ...@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
class MultiS3Mixin: class MultiS3Mixin:
def __init__(self, default_bucket: str, s3_configs: list[S3Config]): def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
"""Initialized with multiple s3 configs. """Initialized with multiple s3 configs.
Args: Args:
default_bucket (str): the default bucket name of the relative path default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list. s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
Raises: Raises:
InvalidConfig: default bucket config not in s3_configs InvalidConfig: default bucket config not in s3_configs.
InvalidConfig: bucket name not unique in s3_configs InvalidConfig: bucket name not unique in s3_configs.
InvalidConfig: default bucket must be provided InvalidConfig: default bucket must be provided.
""" """
if len(default_bucket) == 0: if len(default_prefix) == 0:
raise InvalidConfig('default_bucket must be provided') raise InvalidConfig('default_prefix must be provided')
arr = default_prefix.strip("/").split("/")
self.default_bucket = arr[0]
self.default_prefix = "/".join(arr[1:])
found_default_bucket_config = False found_default_bucket_config = False
for conf in s3_configs: for conf in s3_configs:
if conf.bucket_name == default_bucket: if conf.bucket_name == self.default_bucket:
found_default_bucket_config = True found_default_bucket_config = True
break break
if not found_default_bucket_config: if not found_default_bucket_config:
raise InvalidConfig( raise InvalidConfig(
f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}' f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
) )
uniq_bucket = set([conf.bucket_name for conf in s3_configs]) uniq_bucket = set([conf.bucket_name for conf in s3_configs])
...@@ -39,7 +44,6 @@ class MultiS3Mixin: ...@@ -39,7 +44,6 @@ class MultiS3Mixin:
f'the bucket_name in s3_configs: {s3_configs} must be unique' f'the bucket_name in s3_configs: {s3_configs} must be unique'
) )
self.default_bucket = default_bucket
self.s3_configs = s3_configs self.s3_configs = s3_configs
self._s3_clients_h: dict = {} self._s3_clients_h: dict = {}
...@@ -47,14 +51,14 @@ class MultiS3Mixin: ...@@ -47,14 +51,14 @@ class MultiS3Mixin:
class MultiBucketS3DataReader(DataReader, MultiS3Mixin): class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
def read(self, path: str) -> bytes: def read(self, path: str) -> bytes:
"""Read the path from s3, select diffect bucket client for each request """Read the path from s3, select diffect bucket client for each request
based on the path, also support range read. based on the bucket, also support range read.
Args: Args:
path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
for example: s3://bucket_name/path?0,100 for example: s3://bucket_name/path?0,100.
Returns: Returns:
bytes: the content of s3 file bytes: the content of s3 file.
""" """
may_range_params = parse_s3_range_params(path) may_range_params = parse_s3_range_params(path)
if may_range_params is None or 2 != len(may_range_params): if may_range_params is None or 2 != len(may_range_params):
...@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin): ...@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
"""Read the file with offset and limit, select diffect bucket client """Read the file with offset and limit, select diffect bucket client
for each request based on the path. for each request based on the bucket.
Args: Args:
path (str): the file path path (str): the file path.
offset (int, optional): the number of bytes skipped. Defaults to 0. offset (int, optional): the number of bytes skipped. Defaults to 0.
limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite. limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
Returns: Returns:
bytes: the file content bytes: the file content.
""" """
if path.startswith('s3://'): if path.startswith('s3://'):
bucket_name, path = parse_s3path(path) bucket_name, path = parse_s3path(path)
s3_reader = self.__get_s3_client(bucket_name) s3_reader = self.__get_s3_client(bucket_name)
else: else:
s3_reader = self.__get_s3_client(self.default_bucket) s3_reader = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path)
return s3_reader.read_at(path, offset, limit) return s3_reader.read_at(path, offset, limit)
...@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin): ...@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
def write(self, path: str, data: bytes) -> None: def write(self, path: str, data: bytes) -> None:
"""Write file with data, also select diffect bucket client for each """Write file with data, also select diffect bucket client for each
request based on the path. request based on the bucket.
Args: Args:
path (str): the path of file, if the path is relative path, it will be joined with parent_dir. path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
data (bytes): the data want to write data (bytes): the data want to write.
""" """
if path.startswith('s3://'): if path.startswith('s3://'):
bucket_name, path = parse_s3path(path) bucket_name, path = parse_s3path(path)
s3_writer = self.__get_s3_client(bucket_name) s3_writer = self.__get_s3_client(bucket_name)
else: else:
s3_writer = self.__get_s3_client(self.default_bucket) s3_writer = self.__get_s3_client(self.default_bucket)
path = os.path.join(self.default_prefix, path)
return s3_writer.write(path, data) return s3_writer.write(path, data)
...@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config ...@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
class S3DataReader(MultiBucketS3DataReader): class S3DataReader(MultiBucketS3DataReader):
def __init__( def __init__(
self, self,
default_prefix_without_bucket: str,
bucket: str, bucket: str,
ak: str, ak: str,
sk: str, sk: str,
...@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader): ...@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
"""s3 reader client. """s3 reader client.
Args: Args:
default_prefix_without_bucket: prefix that not contains bucket
bucket (str): bucket name bucket (str): bucket name
ak (str): access key ak (str): access key
sk (str): secret key sk (str): secret key
...@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader): ...@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
""" """
super().__init__( super().__init__(
bucket, f"{bucket}/{default_prefix_without_bucket}"
[ [
S3Config( S3Config(
bucket_name=bucket, bucket_name=bucket,
...@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader): ...@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
class S3DataWriter(MultiBucketS3DataWriter): class S3DataWriter(MultiBucketS3DataWriter):
def __init__( def __init__(
self, self,
default_prefix_without_bucket: str,
bucket: str, bucket: str,
ak: str, ak: str,
sk: str, sk: str,
...@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter): ...@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
"""s3 writer client. """s3 writer client.
Args: Args:
default_prefix_without_bucket: prefix that not contains bucket
bucket (str): bucket name bucket (str): bucket name
ak (str): access key ak (str): access key
sk (str): secret key sk (str): secret key
...@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter): ...@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
""" """
super().__init__( super().__init__(
bucket, f"{bucket}/{default_prefix_without_bucket}"
[ [
S3Config( S3Config(
bucket_name=bucket, bucket_name=bucket,
......
from magic_pdf.data.io.base import IOReader, IOWriter # noqa: F401
from magic_pdf.data.io.http import HttpReader, HttpWriter # noqa: F401
from magic_pdf.data.io.s3 import S3Reader, S3Writer # noqa: F401
__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']
\ No newline at end of file
...@@ -29,7 +29,7 @@ class IOReader(ABC): ...@@ -29,7 +29,7 @@ class IOReader(ABC):
pass pass
class IOWriter: class IOWriter(ABC):
@abstractmethod @abstractmethod
def write(self, path: str, data: bytes) -> None: def write(self, path: str, data: bytes) -> None:
......
...@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field ...@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
class S3Config(BaseModel): class S3Config(BaseModel):
"""S3 config
"""
bucket_name: str = Field(description='s3 bucket name', min_length=1) bucket_name: str = Field(description='s3 bucket name', min_length=1)
access_key: str = Field(description='s3 access key', min_length=1) access_key: str = Field(description='s3 access key', min_length=1)
secret_key: str = Field(description='s3 secret key', min_length=1) secret_key: str = Field(description='s3 secret key', min_length=1)
...@@ -11,5 +13,7 @@ class S3Config(BaseModel): ...@@ -11,5 +13,7 @@ class S3Config(BaseModel):
class PageInfo(BaseModel): class PageInfo(BaseModel):
"""The width and height of page
"""
w: float = Field(description='the width of page') w: float = Field(description='the width of page')
h: float = Field(description='the height of page') h: float = Field(description='the height of page')
Changelog
=========
- 2024/09/27 Version 0.8.1 released, Fixed some bugs, and providing a
`localized deployment version <projects/web_demo/README.md>`__ of the
`online
demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
the `front-end interface <projects/web/README.md>`__.
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with
Dockerfile, and launching demos on Huggingface and Modelscope.
- 2024/08/30: Version 0.7.1 released, add paddle tablemaster table
recognition option
- 2024/08/09: Version 0.7.0b1 released, simplified installation
process, added table recognition functionality
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
issues and installation documentation
- 2024/07/05: Initial open-source release
\ No newline at end of file
FAQ
==========================
1. When using the command ``pip install magic-pdf[full]`` on newer versions of macOS, the error ``zsh: no matches found: magic-pdf[full]`` occurs.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
On macOS, the default shell has switched from Bash to Z shell, which has
special handling logic for certain types of string matching. This can
lead to the “no matches found” error. You can try disabling the globbing
feature in the command line and then run the installation command again.
.. code:: bash
setopt no_nomatch
pip install magic-pdf[full]
2. Encountering the error ``pickle.UnpicklingError: invalid load key, 'v'.`` during use
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This might be due to an incomplete download of the model file. You can
re-download the model file and retry. Reference:
https://github.com/opendatalab/MinerU/issues/143
3. Where should the model files be downloaded and how should the ``/models-dir`` configuration be set?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The path for the model files is configured in “magic-pdf.json”, like
this:
.. code:: json
{
"models-dir": "/tmp/models"
}
This path is an absolute path, not a relative path. You can obtain the
absolute path of the models directory using the “pwd” command.
Reference:
https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874
4. Encountered the error ``ImportError: libGL.so.1: cannot open shared object file: No such file or directory`` in Ubuntu 22.04 on WSL2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``libgl`` library is missing in Ubuntu 22.04 on WSL2. You can
install the ``libgl`` library with the following command to resolve the
issue:
.. code:: bash
sudo apt-get install libgl1-mesa-glx
Reference: https://github.com/opendatalab/MinerU/issues/388
5. Encountered error ``ModuleNotFoundError: No module named 'fairscale'``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You need to uninstall the module and reinstall it:
.. code:: bash
pip uninstall fairscale
pip install fairscale
Reference: https://github.com/opendatalab/MinerU/issues/411
6. On some newer devices like the H100, the text parsed during OCR using CUDA acceleration is garbled.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The compatibility of CUDA 11 with new graphics cards is poor, and the
CUDA version used by Paddle needs to be upgraded.
.. code:: bash
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
Reference: https://github.com/opendatalab/MinerU/issues/558
Glossary
===========
1. jsonl
   JSON Lines: a text format in which each line is a standalone JSON
   object, commonly used for streaming or batch-processing structured
   records.
Known Issues
============
- Reading order is based on the model’s sorting of text distribution in
space, which may become disordered under extremely complex layouts.
- Vertical text is not supported.
- Tables of contents and lists are recognized through rules; a few
uncommon list formats may not be identified.
- Only one level of headings is supported; hierarchical heading levels
are currently not supported.
- Code blocks are not yet supported in the layout model.
- Comic books, art books, elementary school textbooks, and exercise
  books are not well-parsed yet.
- Enabling OCR may produce better results in PDFs with a high density
  of formulas.
- If you are processing PDFs with a large number of formulas, it is
strongly recommended to enable the OCR function. When using PyMuPDF
to extract text, overlapping text lines can occur, leading to
inaccurate formula insertion positions.
Data Api
------------------
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
api/dataset.rst api/dataset
api/data_reader_writer.rst api/data_reader_writer
api/read_api.rst api/read_api
api/schemas
api/io
api/classes
\ No newline at end of file
Class Hierarchy
===============
.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
:parts: 2
.. inheritance-diagram:: magic_pdf.data.dataset
:parts: 2
.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
:parts: 2
Data Reader Writer Data Reader Writer
-------------------- ===================
.. autoclass:: magic_pdf.data.data_reader_writer.DataReader .. autoclass:: magic_pdf.data.data_reader_writer.DataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter .. autoclass:: magic_pdf.data.data_reader_writer.DataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader .. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter .. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter .. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter
:members:
:inherited-members:
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter .. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
Dataset Api Dataset
------------------ ========
.. autoclass:: magic_pdf.data.dataset.PageableData .. autoclass:: magic_pdf.data.dataset.PageableData
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.Dataset .. autoclass:: magic_pdf.data.dataset.Dataset
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.ImageDataset .. autoclass:: magic_pdf.data.dataset.ImageDataset
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.PymuDocDataset .. autoclass:: magic_pdf.data.dataset.PymuDocDataset
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
.. autoclass:: magic_pdf.data.dataset.Doc .. autoclass:: magic_pdf.data.dataset.Doc
:members: :members:
:inherited-members: :inherited-members:
:show-inheritance:
read_api Api read_api
------------------ =========
.. automodule:: magic_pdf.data.read_api .. automodule:: magic_pdf.data.read_api
:members: :members:
......
...@@ -15,7 +15,8 @@ import subprocess ...@@ -15,7 +15,8 @@ import subprocess
import sys import sys
from sphinx.ext import autodoc from sphinx.ext import autodoc
from docutils import nodes
from docutils.parsers.rst import Directive
def install(package): def install(package):
subprocess.check_call([sys.executable, '-m', 'pip', 'install', package]) subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
...@@ -58,10 +59,20 @@ extensions = [ ...@@ -58,10 +59,20 @@ extensions = [
'sphinx_copybutton', 'sphinx_copybutton',
'sphinx.ext.autodoc', 'sphinx.ext.autodoc',
'sphinx.ext.autosummary', 'sphinx.ext.autosummary',
'sphinx.ext.inheritance_diagram',
'myst_parser', 'myst_parser',
'sphinxarg.ext', 'sphinxarg.ext',
'sphinxcontrib.autodoc_pydantic',
] ]
# class hierarchy diagram
inheritance_graph_attrs = dict(rankdir="LR", size='"8.0, 12.0"', fontsize=14, ratio='compress')
inheritance_node_attrs = dict(shape='ellipse', fontsize=14, height=0.75)
inheritance_edge_attrs = dict(arrow='vee')
autodoc_pydantic_model_show_json = True
autodoc_pydantic_model_show_config_summary = False
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ['_templates']
...@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter): ...@@ -120,3 +131,21 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
autodoc.ClassDocumenter = MockedClassDocumenter autodoc.ClassDocumenter = MockedClassDocumenter
navigation_with_keys = False navigation_with_keys = False
# add custom directive
class VideoDirective(Directive):
required_arguments = 1
optional_arguments = 0
final_argument_whitespace = True
option_spec = {}
def run(self):
url = self.arguments[0]
video_node = nodes.raw('', f'<iframe width="560" height="315" src="{url}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>', format='html')
return [video_node]
def setup(app):
app.add_directive('video', VideoDirective)
\ No newline at end of file
...@@ -26,6 +26,50 @@ Welcome to the MinerU Documentation ...@@ -26,6 +26,50 @@ Welcome to the MinerU Documentation
</p> </p>
Project Introduction
--------------------
MinerU is a tool that converts PDFs into machine-readable formats (e.g.,
markdown, JSON), allowing for easy extraction into any format. MinerU
was born during the pre-training process of
`InternLM <https://github.com/InternLM/InternLM>`__. We focus on solving
symbol conversion issues in scientific literature and hope to contribute
to technological development in the era of large models. Compared to
well-known commercial products, MinerU is still young. If you encounter
any issues or if the results are not as expected, please file a report
on the `issue tracker <https://github.com/opendatalab/MinerU/issues>`__
and **attach the relevant PDF**.
.. video:: https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
Key Features
------------
- Removes elements such as headers, footers, footnotes, and page
numbers while maintaining semantic continuity
- Outputs text in a human-readable order from multi-column documents
- Retains the original structure of the document, including titles,
paragraphs, and lists
- Extracts images, image captions, tables, and table captions
- Automatically recognizes formulas in the document and converts them
to LaTeX
- Automatically recognizes tables in the document and converts them to
LaTeX
- Automatically detects and enables OCR for corrupted PDFs
- Supports both CPU and GPU environments
- Supports Windows, Linux, and Mac platforms
User Guide
-------------
.. toctree::
:maxdepth: 2
:caption: User Guide
user_guide
API Reference API Reference
------------- -------------
...@@ -34,5 +78,27 @@ method, this part of the documentation is for you. ...@@ -34,5 +78,27 @@ method, this part of the documentation is for you.
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
:caption: API
api api
Additional Notes
------------------
.. toctree::
:maxdepth: 1
:caption: Additional Notes
additional_notes/known_issues
additional_notes/faq
additional_notes/changelog
additional_notes/glossary
Projects
---------
.. toctree::
:maxdepth: 1
:caption: Projects
projects
\ No newline at end of file
llama_index_rag
===============
gradio_app
============
other projects
===============
\ No newline at end of file
.. toctree::
:maxdepth: 2
user_guide/install
user_guide/quick_start
user_guide/tutorial
user_guide/data
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment