feat(api): simplify markdown and content list generation

- Remove DropMode and MakeMode imports from user code
- Set default drop_mode to DropMode.NONE in get_markdown and get_content_list methods
- Remove md_make_mode parameter from get_content_list method
- Add dump_middle_json method to PipeResult
- Update examples in API documentation and demo script

feat(api): simplify markdown and content list generation
- Remove DropMode and MakeMode imports from user code
- Set default drop_mode to DropMode.NONE in get_markdown and get_content_list methods
- Remove md_make_mode parameter from get_content_list method
- Add dump_middle_json method to PipeResult
- Update examples in API documentation and demo script
52efe94d · myhloli · 15db6fe9 · 52efe94d · 15db6fe9 · 15db6fe9
Commit 52efe94d authored Jan 07, 2025 by myhloli
6 changed files
--- a/demo/demo.py
+++ b/demo/demo.py
@@ -5,7 +5,6 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.config.enums import SupportedPdfParseMethod
-from magic_pdf.config.make_content_config import DropMode, MakeMode
 # args
 pdf_file_name = "demo1.pdf"  # replace with the real pdf path
@@ -54,17 +53,20 @@ pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.
 ### draw spans result on each page
 pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
+### get markdown content
+md_content = pipe_result.get_markdown(image_dir)
 ### dump markdown
 pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+### get content list content
+content_list_content = pipe_result.get_content_list(image_dir)
 ### dump content list
 pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
-### get markdown content
-md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD)
-### get content list content
-content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
 ### get middle json
 middle_json_content = pipe_result.get_middle_json()
+### dump middle json
+pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
--- a/demo/demo1.json
+++ b/demo/demo1.json
--- a/demo/demo2.json
+++ b/demo/demo2.json
--- a/demo/small_ocr.json
+++ b/demo/small_ocr.json
--- a/magic_pdf/operators/pipes.py
+++ b/magic_pdf/operators/pipes.py
@@ -26,14 +26,14 @@ class PipeResult:
    def get_markdown(
        self,
        img_dir_or_bucket_prefix: str,
-        drop_mode=DropMode.WHOLE_PDF,
+        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ) -> str:
        """Get markdown content.
        Args:
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
-            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
        Returns:
@@ -50,7 +50,7 @@ class PipeResult:
        writer: DataWriter,
        file_path: str,
        img_dir_or_bucket_prefix: str,
-        drop_mode=DropMode.WHOLE_PDF,
+        drop_mode=DropMode.NONE,
        md_make_mode=MakeMode.MM_MD,
    ):
        """Dump The Markdown.
@@ -59,7 +59,7 @@ class PipeResult:
            writer (DataWriter): File writer handle
            file_path (str): The file location of markdown
            img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
-            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF.
+            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
        """
@@ -72,14 +72,12 @@ class PipeResult:
        self,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
-        md_make_mode=MakeMode.STANDARD_FORMAT,
    ) -> str:
        """Get Content List.
        Args:
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
-            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
        Returns:
            str: content list content
@@ -87,7 +85,7 @@ class PipeResult:
        pdf_info_list = self._pipe_res['pdf_info']
        content_list = union_make(
            pdf_info_list,
-            md_make_mode,
+            MakeMode.STANDARD_FORMAT,
            drop_mode,
            image_dir_or_bucket_prefix,
        )
@@ -99,7 +97,6 @@ class PipeResult:
        file_path: str,
        image_dir_or_bucket_prefix: str,
        drop_mode=DropMode.NONE,
-        md_make_mode=MakeMode.STANDARD_FORMAT,
    ):
        """Dump Content List.
@@ -108,10 +105,9 @@ class PipeResult:
            file_path (str): The file location of content list
            image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
            drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
-            md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
        """
        content_list = self.get_content_list(
-            image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode
+            image_dir_or_bucket_prefix, drop_mode=drop_mode,
        )
        writer.write_string(
            file_path, json.dumps(content_list, ensure_ascii=False, indent=4)

--- a/next_docs/en/user_guide/usage/api.rst
+++ b/next_docs/en/user_guide/usage/api.rst
@@ -17,7 +17,6 @@ Local File Example
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
    from magic_pdf.config.enums import SupportedPdfParseMethod
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
    # args
    pdf_file_name = "abc.pdf"  # replace with the real pdf path
@@ -66,21 +65,24 @@ Local File Example
    ### draw spans result on each page
    pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
+    ### get markdown content
+    md_content = pipe_result.get_markdown(image_dir)
    ### dump markdown
    pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+    ### get content list content
+    content_list_content = pipe_result.get_content_list(image_dir)
    ### dump content list
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
-    ### get markdown content
-    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
-    ### get content list content
-    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 
    ### get middle json
    middle_json_content = pipe_result.get_middle_json()
+    ### dump middle json
+    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
 S3 File Example
@@ -93,7 +95,6 @@ S3 File Example
    from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
    from magic_pdf.data.dataset import PymuDocDataset
    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-    from magic_pdf.config.make_content_config import DropMode, MakeMode
    from magic_pdf.config.enums import SupportedPdfParseMethod
    bucket_name = "{Your S3 Bucket Name}"  # replace with real bucket name
@@ -157,15 +158,16 @@ S3 File Example
    pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
    ### get markdown content
-    md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
+    md_content = pipe_result.get_markdown(image_dir)
    ### get content list content
-    content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) 
+    content_list_content = pipe_result.get_content_list(image_dir)
    ### get middle json
    middle_json_content = pipe_result.get_middle_json()
+    ### dump middle json
+    pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
 MS-Office
 ----------