Merge pull request #1490 from opendatalab/master

master->dev

Merge pull request #1490 from opendatalab/master
master->dev
25931417 · Xiaomeng Zhao · GitHub · a636209b · 2c4a586e · 25931417
Unverified commit 25931417, authored Jan 10, 2025 by Xiaomeng Zhao; committed by GitHub on Jan 10, 2025
3 changed files
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
-__version__ = "0.10.6"
+__version__ = "1.0.0"
--- a/next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
+++ b/next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
@@ -12,6 +12,7 @@
    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.config.enums import SupportedPdfParseMethod

    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
@@ -36,15 +37,22 @@
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

-    ## inference 
-    infer_result = ds.apply(doc_analyze, ocr=True)
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))

-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))

@@ -54,6 +62,9 @@
    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)

+    ### dump content list
+    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
+

 对象存储文件示例
 ^^^^^^^^^^^^^^^^
@@ -92,23 +103,32 @@
    ## Create Dataset Instance
    ds = PymuDocDataset(pdf_bytes)

-    ## inference 
-    infer_result = ds.apply(doc_analyze, ocr=True)
+    ## inference
+    if ds.classify() == SupportedPdfParseMethod.OCR:
+        infer_result = ds.apply(doc_analyze, ocr=True)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_ocr_mode(image_writer)
+
+    else:
+        infer_result = ds.apply(doc_analyze, ocr=False)
+
+        ## pipeline
+        pipe_result = infer_result.pipe_txt_mode(image_writer)

    ### draw model result on each page
    infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf'))  # dump to local

-    ## pipeline
-    pipe_result = infer_result.pipe_ocr_mode(image_writer)
-
    ### draw layout result on each page
    pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf'))  # dump to local

    ### draw spans result on each page
-    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local 
+    pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf'))  # dump to local

    ### dump markdown
    pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images")  # dump to remote s3

+    ### dump content list
+    pipe_result.dump_content_list(writer, f"{name_without_suff}_content_list.json", "unittest/tmp/images")  # dump to remote s3

 前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -103,6 +103,38 @@
      "created_at": "2024-11-19T07:28:12Z",
      "repoId": 765083837,
      "pullRequestNo": 1024
+    },
+    {
+      "name": "MatthewZMD",
+      "id": 12422335,
+      "comment_id": 2565021810,
+      "created_at": "2024-12-30T04:46:33Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1379
+    },
+    {
+      "name": "yzztin",
+      "id": 99233593,
+      "comment_id": 2568773016,
+      "created_at": "2025-01-03T07:02:55Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1397
+    },
+    {
+      "name": "utopia2077",
+      "id": 78017255,
+      "comment_id": 2571704177,
+      "created_at": "2025-01-05T17:57:17Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1412
+    },
+    {
+      "name": "beholder91",
+      "id": 113708464,
+      "comment_id": 2581919559,
+      "created_at": "2025-01-10T06:58:05Z",
+      "repoId": 765083837,
+      "pullRequestNo": 1479
    }
  ]
 }
\ No newline at end of file