"git@developer.sourcefind.cn:change/sglang.git" did not exist on "52694b60dab3a0e755201e6cec4d7f14d5c8b716"
Commit 6ca86bea authored by xu rui's avatar xu rui
Browse files

docs: rewrite install and usage docs

parent fd2f3c58
...@@ -59,17 +59,20 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]: ...@@ -59,17 +59,20 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
""" """
if os.path.isdir(path): if os.path.isdir(path):
reader = FileBasedDataReader(path) reader = FileBasedDataReader(path)
return [ ret = []
PymuDocDataset(reader.read(doc_path.name)) for root, _, files in os.walk(path):
for doc_path in Path(path).glob('*.pdf') for file in files:
] suffix = file.split('.')
if suffix[-1] == 'pdf':
ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
return ret
else: else:
reader = FileBasedDataReader() reader = FileBasedDataReader()
bits = reader.read(path) bits = reader.read(path)
return [PymuDocDataset(bits)] return [PymuDocDataset(bits)]
def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]: def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
"""Read images from path or directory. """Read images from path or directory.
Args: Args:
...@@ -87,7 +90,7 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]: ...@@ -87,7 +90,7 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
for file in files: for file in files:
suffix = file.split('.') suffix = file.split('.')
if suffix[-1] in s_suffixes: if suffix[-1] in s_suffixes:
imgs_bits.append(reader.read(file)) imgs_bits.append(reader.read(os.path.join(root, file)))
return [ImageDataset(bits) for bits in imgs_bits] return [ImageDataset(bits) for bits in imgs_bits]
else: else:
reader = FileBasedDataReader() reader = FileBasedDataReader()
......
...@@ -70,6 +70,12 @@ Key Features ...@@ -70,6 +70,12 @@ Key Features
- Supports both CPU and GPU environments. - Supports both CPU and GPU environments.
- Compatible with Windows, Linux, and Mac platforms. - Compatible with Windows, Linux, and Mac platforms.
.. tip::
Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
User Guide User Guide
------------- -------------
.. toctree:: .. toctree::
......
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
:maxdepth: 2 :maxdepth: 2
user_guide/install user_guide/install
user_guide/usage
user_guide/quick_start user_guide/quick_start
user_guide/tutorial user_guide/tutorial
user_guide/data user_guide/data
user_guide/inference_result
user_guide/pipe_result
...@@ -125,16 +125,16 @@ Read Examples ...@@ -125,16 +125,16 @@ Read Examples
# s3 related # s3 related
s3_reader1 = S3DataReader( s3_reader1 = S3DataReader(
default_prefix_without_bucket = "test_prefix" "test_prefix",
bucket: "test_bucket", "test_bucket",
ak: "ak", "ak",
sk: "sk", "sk",
endpoint_url: "localhost" "localhost"
) )
## will read s3://test_bucket/test_prefix/abc ## will read s3://test_bucket/test_prefix/abc
s3_reader1.read('abc') s3_reader1.read('abc')
## will read s3://test_bucket/efg ## will read s3://test_bucket/efg
s3_reader1.read('s3://test_bucket/efg') s3_reader1.read('s3://test_bucket/efg')
...@@ -188,11 +188,11 @@ Write Examples ...@@ -188,11 +188,11 @@ Write Examples
# s3 related # s3 related
s3_writer1 = S3DataWriter( s3_writer1 = S3DataWriter(
default_prefix_without_bucket = "test_prefix" "test_prefix",
bucket: "test_bucket", "test_bucket",
ak: "ak", "ak",
sk: "sk", "sk",
endpoint_url: "localhost" "localhost"
) )
## will write 123 to s3://test_bucket/test_prefix/abc ## will write 123 to s3://test_bucket/test_prefix/abc
......
...@@ -18,24 +18,50 @@ Read the contet from jsonl which may located on local machine or remote s3. if y ...@@ -18,24 +18,50 @@ Read the contet from jsonl which may located on local machine or remote s3. if y
.. code:: python .. code:: python
from magic_pdf.data.io.read_api import * from magic_pdf.data.read_api import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# read jsonl from local machine # read jsonl from local machine
datasets = read_jsonl("tt.jsonl", None) datasets = read_jsonl("tt.jsonl", None) # replace with real jsonl file
# read jsonl from remote s3 # read jsonl from remote s3
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
bucket = "bucket_1" # replace with real s3 bucket
ak = "access_key_1" # replace with real s3 access key
sk = "secret_key_1" # replace with real s3 secret key
endpoint_url = "endpoint_url_1" # replace with real s3 endpoint url
bucket_2 = "bucket_2" # replace with real s3 bucket
ak_2 = "access_key_2" # replace with real s3 access key
sk_2 = "secret_key_2" # replace with real s3 secret key
endpoint_url_2 = "endpoint_url_2" # replace with real s3 endpoint url
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
s3_reader = MultiBucketS3DataReader(bucket, s3configs)
datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader) # replace with real s3 jsonl file
read_local_pdfs read_local_pdfs
^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^
Read pdf from path or directory. Read pdf from path or directory.
.. code:: python .. code:: python
from magic_pdf.data.io.read_api import * from magic_pdf.data.read_api import *
# read pdf path # read pdf path
datasets = read_local_pdfs("tt.pdf") datasets = read_local_pdfs("tt.pdf")
...@@ -51,7 +77,7 @@ Read images from path or directory ...@@ -51,7 +77,7 @@ Read images from path or directory
.. code:: python .. code:: python
from magic_pdf.data.io.read_api import * from magic_pdf.data.read_api import *
# read from image path # read from image path
datasets = read_local_images("tt.png") datasets = read_local_images("tt.png")
......
Inference Result
==================
...@@ -8,5 +8,5 @@ Installation ...@@ -8,5 +8,5 @@ Installation
install/install install/install
install//boost_with_cuda install//boost_with_cuda
install/download_model_weight_files install/download_model_weight_files
install/config
...@@ -9,25 +9,7 @@ appropriate guide based on your system: ...@@ -9,25 +9,7 @@ appropriate guide based on your system:
- :ref:`ubuntu_22_04_lts_section` - :ref:`ubuntu_22_04_lts_section`
- :ref:`windows_10_or_11_section` - :ref:`windows_10_or_11_section`
- Quick Deployment with Docker
.. admonition:: Important
:class: tip
Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
.. code-block:: bash
bash docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
.. _ubuntu_22_04_lts_section: .. _ubuntu_22_04_lts_section:
......
Config
=========
The **magic-pdf.json** file is typically located in the **${HOME}** directory on Linux or in the **C:\\Users\\{username}** directory on Windows.
magic-pdf.json
----------------
.. code:: json
{
"bucket_info":{
"bucket-name-1":["ak", "sk", "endpoint"],
"bucket-name-2":["ak", "sk", "endpoint"]
},
"models-dir":"/tmp/models",
"layoutreader-model-dir":"/tmp/layoutreader",
"device-mode":"cpu",
"layout-config": {
"model": "layoutlmv3"
},
"formula-config": {
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
},
"table-config": {
"model": "rapid_table",
"enable": false,
"max_time": 400
},
"config_version": "1.0.0"
}
bucket_info
^^^^^^^^^^^^^^
Stores the access_key, secret_key and endpoint for each AWS S3-compatible storage bucket.
Example:
.. code:: text
{
"image_bucket":[{access_key}, {secret_key}, {endpoint}],
"video_bucket":[{access_key}, {secret_key}, {endpoint}]
}
models-dir
^^^^^^^^^^^^
Directory that stores the models downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the models using the scripts shipped with **MinerU**.
layoutreader-model-dir
^^^^^^^^^^^^^^^^^^^^^^^
Directory that stores the layoutreader model downloaded from **huggingface** or **modelscope**. You do not need to modify this field if you download the model using the scripts shipped with **MinerU**.
device-mode
^^^^^^^^^^^^^^
This field has two options, **cpu** or **cuda**.
**cpu**: inference via cpu
**cuda**: using cuda to accelerate inference
layout-config
^^^^^^^^^^^^^^^
.. code:: json
{
"model": "layoutlmv3"
}
The layout model cannot be disabled at present, and only one kind of layout model is currently available.
formula-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"mfd_model": "yolo_v8_mfd",
"mfr_model": "unimernet_small",
"enable": true
}
mfd_model
""""""""""
Specify the formula detection model, options are ['yolo_v8_mfd']
mfr_model
""""""""""
Specify the formula recognition model, options are ['unimernet_small']
Check `UniMERNet <https://github.com/opendatalab/UniMERNet>`_ for more details
enable
""""""""
On/off flag; options are [true, false]. **true** enables formula inference, **false** disables formula inference.
table-config
^^^^^^^^^^^^^^^^
.. code:: json
{
"model": "rapid_table",
"enable": false,
"max_time": 400
}
model
""""""""
Specify the table inference model, options are ['rapid_table', 'tablemaster', 'struct_eqtable']
max_time
"""""""""
Since table recognition is a time-consuming process, we set a timeout period. If the process exceeds this time, the table recognition will be terminated.
enable
"""""""
On/off flag; options are [true, false]. **true** enables table inference, **false** disables table inference.
config_version
^^^^^^^^^^^^^^^^
The version of config schema.
Check `Config Schema <https://github.com/opendatalab/MinerU/blob/master/magic-pdf.template.json>`_ for the latest config schema.
\ No newline at end of file
...@@ -4,6 +4,7 @@ Install ...@@ -4,6 +4,7 @@ Install
If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`. If you encounter any installation issues, please first consult the :doc:`../../additional_notes/faq`.
If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`. If the parsing results are not as expected, refer to the :doc:`../../additional_notes/known_issues`.
You can also try the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ without installing anything.
.. admonition:: Warning .. admonition:: Warning
:class: tip :class: tip
...@@ -107,4 +108,6 @@ Download model weight files ...@@ -107,4 +108,6 @@ Download model weight files
python download_models_hf.py python download_models_hf.py
The MinerU is installed, Check out :doc:`../quick_start` or reading :doc:`boost_with_cuda` for accelerate inference .. tip::
\ No newline at end of file
MinerU is now installed. Check out :doc:`../quick_start/command_line` to convert your first PDF, **or** read the following sections for more details about installation.
...@@ -2,12 +2,14 @@ ...@@ -2,12 +2,14 @@
Quick Start Quick Start
============== ==============
Eager to get started? This page gives a good introduction to MinerU. Follow Installation to set up a project and install MinerU first. Want to learn about the usage methods under different scenarios ? This page gives good examples about multiple usage cases match your needs.
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
quick_start/command_line quick_start/convert_pdf
quick_start/to_markdown quick_start/convert_images
quick_start/convert_ppt
quick_start/convert_word
quick_start/convert_directory
Convert Files Under Directory
=================================
.. code:: python
Usage
========
.. toctree::
:maxdepth: 1
usage/command_line
usage/api
usage/docker
Api Usage
Convert To Markdown ===========
========================
Local File Example Local File Example
^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^
......
...@@ -57,6 +57,6 @@ directory. The output file list is as follows: ...@@ -57,6 +57,6 @@ directory. The output file list is as follows:
.. admonition:: Tip .. admonition:: Tip
:class: tip :class: tip
For more information about the output files, please refer to the :doc:`../tutorial/output_file_description` For more information about the output files, please refer to the :doc:`TODO: modify link <../tutorial/output_file_description>`
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment