Commit a4b29f89 authored by icecraft's avatar icecraft
Browse files

feat: add s3 example

parent 2f0e5b2a
......@@ -3,12 +3,16 @@
Convert To Markdown
========================
Local File Example
^^^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
......@@ -23,7 +27,7 @@ Convert To Markdown
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
) # create 00
)
image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("")
......@@ -49,4 +53,50 @@ Convert To Markdown
md_writer.write_string(f"{pdf_file_name}.md", md_content)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
S3 File Example
^^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
model_list = []
pdf_file_name = f"s3://{bucket_name}/{fake pdf path}" # replace with the real s3 path
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown(
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
if isinstance(md_content, list):
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
......@@ -3,12 +3,16 @@
转换为 Markdown 文件
========================
本地文件示例
^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
......@@ -23,7 +27,7 @@
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
) # create 00
)
image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("")
......@@ -49,5 +53,51 @@
md_writer.write_string(f"{pdf_file_name}.md", md_content)
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
对象存储使用示例
^^^^^^^^^^^^^^^
.. code:: python
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
model_list = []
pdf_file_name = f"s3://{bucket_name}/{fake pdf path}" # replace with the real s3 path
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown(
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
)
if isinstance(md_content, list):
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment