Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
1b654fc2
"vscode:/vscode.git/clone" did not exist on "0ee9f56e72fa016eeeba132f2d4c7e08ecf0ac2d"
Unverified
Commit
1b654fc2
authored
Jan 09, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jan 09, 2025
Browse files
Merge branch 'master' into release-1.0.0
parents
7ed89fb9
a53a467a
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
55 additions
and
11 deletions
+55
-11
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
+31
-11
signatures/version1/cla.json
signatures/version1/cla.json
+24
-0
No files found.
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
View file @
1b654fc2
...
@@ -12,6 +12,7 @@
...
@@ -12,6 +12,7 @@
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
pdf_file_name = "abc.pdf" # replace with the real pdf path
...
@@ -36,15 +37,22 @@
...
@@ -36,15 +37,22 @@
## Create Dataset Instance
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds = PymuDocDataset(pdf_bytes)
## inference
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...
@@ -54,6 +62,9 @@
...
@@ -54,6 +62,9 @@
### dump markdown
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
对象存储文件示例
对象存储文件示例
^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^
...
@@ -92,23 +103,32 @@
...
@@ -92,23 +103,32 @@
## Create Dataset Instance
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
ds = PymuDocDataset(pdf_bytes)
## inference
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
### draw spans result on each page
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
### dump markdown
### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
signatures/version1/cla.json
View file @
1b654fc2
...
@@ -103,6 +103,30 @@
...
@@ -103,6 +103,30 @@
"created_at"
:
"2024-11-19T07:28:12Z"
,
"created_at"
:
"2024-11-19T07:28:12Z"
,
"repoId"
:
765083837
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1024
"pullRequestNo"
:
1024
},
{
"name"
:
"MatthewZMD"
,
"id"
:
12422335
,
"comment_id"
:
2565021810
,
"created_at"
:
"2024-12-30T04:46:33Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1379
},
{
"name"
:
"yzztin"
,
"id"
:
99233593
,
"comment_id"
:
2568773016
,
"created_at"
:
"2025-01-03T07:02:55Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1397
},
{
"name"
:
"utopia2077"
,
"id"
:
78017255
,
"comment_id"
:
2571704177
,
"created_at"
:
"2025-01-05T17:57:17Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1412
}
}
]
]
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment