Commit 52efe94d authored by myhloli
Browse files

feat(api): simplify markdown and content list generation

- Remove DropMode and MakeMode imports from user code
- Set default drop_mode to DropMode.NONE in get_markdown and dump_md methods (get_content_list already defaulted to DropMode.NONE)
- Remove md_make_mode parameter from get_content_list and dump_content_list methods (MakeMode.STANDARD_FORMAT is now always used)
- Add dump_middle_json method to PipeResult
- Update examples in API documentation and demo script
parent 15db6fe9
...@@ -5,7 +5,6 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData ...@@ -5,7 +5,6 @@ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedData
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
# args # args
pdf_file_name = "demo1.pdf" # replace with the real pdf path pdf_file_name = "demo1.pdf" # replace with the real pdf path
...@@ -54,17 +53,20 @@ pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout. ...@@ -54,17 +53,20 @@ pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.
### draw spans result on each page ### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")) pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### dump markdown ### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### dump content list ### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
### get middle json ### get middle json
middle_json_content = pipe_result.get_middle_json() middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -26,14 +26,14 @@ class PipeResult: ...@@ -26,14 +26,14 @@ class PipeResult:
def get_markdown( def get_markdown(
self, self,
img_dir_or_bucket_prefix: str, img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF, drop_mode=DropMode.NONE,
md_make_mode=MakeMode.MM_MD, md_make_mode=MakeMode.MM_MD,
) -> str: ) -> str:
"""Get markdown content. """Get markdown content.
Args: Args:
img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
Returns: Returns:
...@@ -50,7 +50,7 @@ class PipeResult: ...@@ -50,7 +50,7 @@ class PipeResult:
writer: DataWriter, writer: DataWriter,
file_path: str, file_path: str,
img_dir_or_bucket_prefix: str, img_dir_or_bucket_prefix: str,
drop_mode=DropMode.WHOLE_PDF, drop_mode=DropMode.NONE,
md_make_mode=MakeMode.MM_MD, md_make_mode=MakeMode.MM_MD,
): ):
"""Dump The Markdown. """Dump The Markdown.
...@@ -59,7 +59,7 @@ class PipeResult: ...@@ -59,7 +59,7 @@ class PipeResult:
writer (DataWriter): File writer handle writer (DataWriter): File writer handle
file_path (str): The file location of markdown file_path (str): The file location of markdown
img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure img_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.WHOLE_PDF. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD. md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.MM_MD.
""" """
...@@ -72,14 +72,12 @@ class PipeResult: ...@@ -72,14 +72,12 @@ class PipeResult:
self, self,
image_dir_or_bucket_prefix: str, image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE, drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT,
) -> str: ) -> str:
"""Get Content List. """Get Content List.
Args: Args:
image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
Returns: Returns:
str: content list content str: content list content
...@@ -87,7 +85,7 @@ class PipeResult: ...@@ -87,7 +85,7 @@ class PipeResult:
pdf_info_list = self._pipe_res['pdf_info'] pdf_info_list = self._pipe_res['pdf_info']
content_list = union_make( content_list = union_make(
pdf_info_list, pdf_info_list,
md_make_mode, MakeMode.STANDARD_FORMAT,
drop_mode, drop_mode,
image_dir_or_bucket_prefix, image_dir_or_bucket_prefix,
) )
...@@ -99,7 +97,6 @@ class PipeResult: ...@@ -99,7 +97,6 @@ class PipeResult:
file_path: str, file_path: str,
image_dir_or_bucket_prefix: str, image_dir_or_bucket_prefix: str,
drop_mode=DropMode.NONE, drop_mode=DropMode.NONE,
md_make_mode=MakeMode.STANDARD_FORMAT,
): ):
"""Dump Content List. """Dump Content List.
...@@ -108,10 +105,9 @@ class PipeResult: ...@@ -108,10 +105,9 @@ class PipeResult:
file_path (str): The file location of content list file_path (str): The file location of content list
image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure image_dir_or_bucket_prefix (str): The s3 bucket prefix or local file directory which used to store the figure
drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE. drop_mode (str, optional): Drop strategy when some page which is corrupted or inappropriate. Defaults to DropMode.NONE.
md_make_mode (str, optional): The content Type of Markdown be made. Defaults to MakeMode.STANDARD_FORMAT.
""" """
content_list = self.get_content_list( content_list = self.get_content_list(
image_dir_or_bucket_prefix, drop_mode=drop_mode, md_make_mode=md_make_mode image_dir_or_bucket_prefix, drop_mode=drop_mode,
) )
writer.write_string( writer.write_string(
file_path, json.dumps(content_list, ensure_ascii=False, indent=4) file_path, json.dumps(content_list, ensure_ascii=False, indent=4)
......
...@@ -17,7 +17,6 @@ Local File Example ...@@ -17,7 +17,6 @@ Local File Example
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
# args # args
pdf_file_name = "abc.pdf" # replace with the real pdf path pdf_file_name = "abc.pdf" # replace with the real pdf path
...@@ -66,21 +65,24 @@ Local File Example ...@@ -66,21 +65,24 @@ Local File Example
### draw spans result on each page ### draw spans result on each page
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")) pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
### get markdown content
md_content = pipe_result.get_markdown(image_dir)
### dump markdown ### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir)
### dump content list ### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD)
### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT)
### get middle json ### get middle json
middle_json_content = pipe_result.get_middle_json() middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
S3 File Example S3 File Example
...@@ -93,7 +95,6 @@ S3 File Example ...@@ -93,7 +95,6 @@ S3 File Example
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.config.enums import SupportedPdfParseMethod from magic_pdf.config.enums import SupportedPdfParseMethod
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
...@@ -157,15 +158,16 @@ S3 File Example ...@@ -157,15 +158,16 @@ S3 File Example
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir) pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
### get markdown content ### get markdown content
md_content = pipe_result.get_markdown(image_dir, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) md_content = pipe_result.get_markdown(image_dir)
### get content list content ### get content list content
content_list_content = pipe_result.get_content_list(image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.STANDARD_FORMAT) content_list_content = pipe_result.get_content_list(image_dir)
### get middle json ### get middle json
middle_json_content = pipe_result.get_middle_json() middle_json_content = pipe_result.get_middle_json()
### dump middle json
pipe_result.dump_middle_json(md_writer, f'{name_without_suff}_middle.json')
MS-Office MS-Office
---------- ----------
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment