Commit 6ca86bea authored by xu rui
Browse files

docs: rewrite install and usage docs

parent fd2f3c58
Docker
=======
.. admonition:: Important
:class: tip
The Docker image requires a GPU with at least 16 GB of VRAM, and all acceleration features are enabled by default.
Before running the container, use the following command to check whether your device supports CUDA acceleration in Docker.
.. code-block:: bash
docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
......@@ -111,11 +111,11 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
# S3 相关的
s3_reader1 = S3DataReader(
default_prefix_without_bucket = "test_prefix",
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
)
## 将读取 s3://test_bucket/test_prefix/abc
......@@ -172,11 +172,11 @@ S3DataReader 基于 MultiBucketS3DataReader 构建,但仅支持单个桶。S3D
# S3 相关的
s3_writer1 = S3DataWriter(
default_prefix_without_bucket = "test_prefix",
bucket: "test_bucket",
ak: "ak",
sk: "sk",
endpoint_url: "localhost"
"test_prefix",
"test_bucket",
"ak",
"sk",
"localhost"
)
## 将写入 123 到 s3://test_bucket/test_prefix/abc
......
......@@ -15,13 +15,41 @@ read_jsonl
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
from magic_pdf.data.data_reader_writer import MultiBucketS3DataReader
from magic_pdf.data.schemas import S3Config
# 从本地机器读取 JSONL
datasets = read_jsonl("tt.jsonl", None)
# 读取本地 jsonl 文件
datasets = read_jsonl("tt.jsonl", None) # 替换为有效的文件
# 读取 s3 jsonl 文件
bucket = "bucket_1" # 替换为有效的 s3 bucket
ak = "access_key_1" # 替换为有效的 s3 access key
sk = "secret_key_1" # 替换为有效的 s3 secret key
endpoint_url = "endpoint_url_1" # 替换为有效的 s3 endpoint url
bucket_2 = "bucket_2" # 替换为有效的 s3 bucket
ak_2 = "access_key_2" # 替换为有效的 s3 access key
sk_2 = "secret_key_2" # 替换为有效的 s3 secret key
endpoint_url_2 = "endpoint_url_2" # 替换为有效的 s3 endpoint url
s3configs = [
S3Config(
bucket_name=bucket, access_key=ak, secret_key=sk, endpoint_url=endpoint_url
),
S3Config(
bucket_name=bucket_2,
access_key=ak_2,
secret_key=sk_2,
endpoint_url=endpoint_url_2,
),
]
s3_reader = MultiBucketS3DataReader(bucket, s3configs)
datasets = read_jsonl(f"s3://bucket_1/tt.jsonl", s3_reader) # 替换为有效的 s3 jsonl file
# 从远程 S3 读取 JSONL
datasets = read_jsonl("s3://bucket_1/tt.jsonl", s3_reader)
read_local_pdfs
^^^^^^^^^^^^^^^^
......@@ -30,7 +58,7 @@ read_local_pdfs
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# 读取 PDF 路径
datasets = read_local_pdfs("tt.pdf")
......@@ -45,7 +73,7 @@ read_local_images
.. code:: python
from magic_pdf.data.io.read_api import *
from magic_pdf.data.read_api import *
# 从图像路径读取
datasets = read_local_images("tt.png")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment