Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
58b2e78d
Unverified
Commit
58b2e78d
authored
Dec 20, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 20, 2024
Browse files
Merge pull request #1336 from icecraft/docs/add_more_method_usage
docs: add more method description
parents
a9dea5f0
24ee9c41
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
34 additions
and
3 deletions
+34
-3
next_docs/en/user_guide/usage/api.rst
next_docs/en/user_guide/usage/api.rst
+34
-3
No files found.
next_docs/en/user_guide/usage/api.rst
View file @
58b2e78d
...
@@ -17,6 +17,7 @@ Local File Example
...
@@ -17,6 +17,7 @@ Local File Example
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
# args
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
pdf_file_name = "abc.pdf" # replace with the real pdf path
...
@@ -31,7 +32,6 @@ Local File Example
...
@@ -31,7 +32,6 @@ Local File Example
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
local_md_dir
local_md_dir
)
)
image_dir = str(os.path.basename(local_image_dir))
# read bytes
# read bytes
reader1 = FileBasedDataReader("")
reader1 = FileBasedDataReader("")
...
@@ -57,6 +57,9 @@ Local File Example
...
@@ -57,6 +57,9 @@ Local File Example
### draw model result on each page
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...
@@ -69,6 +72,16 @@ Local File Example
...
@@ -69,6 +72,16 @@ Local File Example
### dump content list
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
### get middle json
middle_json_content = pipe_result.get_middle_json()
S3 File Example
S3 File Example
^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^
...
@@ -80,20 +93,25 @@ S3 File Example
...
@@ -80,20 +93,25 @@ S3 File Example
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.enums import SupportedPdfParseMethod
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
ak = "{Your S3 access key}" # replace with real s3 access key
sk = "{Your S3 secret key}" # replace with real s3 secret key
sk = "{Your S3 secret key}" # replace with real s3 secret key
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
endpoint_url = "{Your S3 endpoint_url}" # replace with real s3 endpoint_url
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
reader = S3DataReader('unittest/tmp/', bucket_name, ak, sk, endpoint_url) # replace `unittest/tmp` with the real s3 prefix
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
md_writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
# args
# args
pdf_file_name = (
pdf_file_name = (
"s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
f"s3://{bucket_name}/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)
)
# prepare env
# prepare env
...
@@ -123,6 +141,9 @@ S3 File Example
...
@@ -123,6 +141,9 @@ S3 File Example
### draw model result on each page
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
### get model inference result
model_inference_result = infer_result.get_infer_res()
### draw layout result on each page
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...
@@ -135,6 +156,16 @@ S3 File Example
...
@@ -135,6 +156,16 @@ S3 File Example
### dump content list
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
### get middle json
middle_json_content = pipe_result.get_middle_json()
MS-Office
MS-Office
----------
----------
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment