Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
25931417
"vscode:/vscode.git/clone" did not exist on "af5105c7b1111e2031e2d44d1bede83d30bd32be"
Unverified
Commit
25931417
authored
Jan 10, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jan 10, 2025
Browse files
Merge pull request #1490 from opendatalab/master
master->dev
parents
a636209b
2c4a586e
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
64 additions
and
12 deletions
+64
-12
magic_pdf/libs/version.py
magic_pdf/libs/version.py
+1
-1
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
+31
-11
signatures/version1/cla.json
signatures/version1/cla.json
+32
-0
No files found.
magic_pdf/libs/version.py
View file @
25931417
__version__
=
"
0.10.6
"
__version__
=
"
1.0.0
"
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
View file @
25931417
...
...
@@ -12,6 +12,7 @@
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args
pdf_file_name = "abc.pdf" # replace with the real pdf path
...
...
@@ -37,14 +38,21 @@
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...
...
@@ -54,6 +62,9 @@
### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
对象存储文件示例
^^^^^^^^^^^^^^^^
...
...
@@ -93,14 +104,21 @@
ds = PymuDocDataset(pdf_bytes)
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
...
...
@@ -110,5 +128,7 @@
### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
signatures/version1/cla.json
View file @
25931417
...
...
@@ -103,6 +103,38 @@
"created_at"
:
"2024-11-19T07:28:12Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1024
},
{
"name"
:
"MatthewZMD"
,
"id"
:
12422335
,
"comment_id"
:
2565021810
,
"created_at"
:
"2024-12-30T04:46:33Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1379
},
{
"name"
:
"yzztin"
,
"id"
:
99233593
,
"comment_id"
:
2568773016
,
"created_at"
:
"2025-01-03T07:02:55Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1397
},
{
"name"
:
"utopia2077"
,
"id"
:
78017255
,
"comment_id"
:
2571704177
,
"created_at"
:
"2025-01-05T17:57:17Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1412
},
{
"name"
:
"beholder91"
,
"id"
:
113708464
,
"comment_id"
:
2581919559
,
"created_at"
:
"2025-01-10T06:58:05Z"
,
"repoId"
:
765083837
,
"pullRequestNo"
:
1479
}
]
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment