Commit 26200c4a authored by icecraft
Browse files

fix: remove deprecated demo code

parent 9ec5afaf
...@@ -14,6 +14,7 @@ Local File Example ...@@ -14,6 +14,7 @@ Local File Example
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args # args
pdf_file_name = "abc.pdf" # replace with the real pdf path pdf_file_name = "abc.pdf" # replace with the real pdf path
...@@ -38,15 +39,22 @@ Local File Example ...@@ -38,15 +39,22 @@ Local File Example
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
## inference ## inference
infer_result = ds.apply(doc_analyze, ocr=True) if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page ### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page ### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...@@ -56,6 +64,9 @@ Local File Example ...@@ -56,6 +64,9 @@ Local File Example
### dump markdown ### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
S3 File Example S3 File Example
^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^
...@@ -94,23 +105,32 @@ S3 File Example ...@@ -94,23 +105,32 @@ S3 File Example
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
## inference ## inference
infer_result = ds.apply(doc_analyze, ocr=True) if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page ### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page ### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
### draw spans result on each page ### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
### dump markdown ### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3 pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details Check :doc:`../data/data_reader_writer` for more [reader | writer] examples and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# args # args
pdf_file_name = "abc.pdf" # replace with the real pdf path pdf_file_name = "abc.pdf" # replace with the real pdf path
...@@ -36,15 +37,22 @@ ...@@ -36,15 +37,22 @@
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
## inference ## inference
infer_result = ds.apply(doc_analyze, ocr=True) if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page ### draw model result on each page
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")) infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page ### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")) pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
...@@ -54,6 +62,9 @@ ...@@ -54,6 +62,9 @@
### dump markdown ### dump markdown
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir) pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
对象存储文件示例 对象存储文件示例
^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^
...@@ -92,23 +103,32 @@ ...@@ -92,23 +103,32 @@
## Create Dataset Instance ## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes) ds = PymuDocDataset(pdf_bytes)
## inference ## inference
infer_result = ds.apply(doc_analyze, ocr=True) if ds.classify() == SupportedPdfParseMethod.OCR:
infer_result = ds.apply(doc_analyze, ocr=True)
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
else:
infer_result = ds.apply(doc_analyze, ocr=False)
## pipeline
pipe_result = infer_result.pipe_txt_mode(image_writer)
### draw model result on each page ### draw model result on each page
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page ### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
### draw spans result on each page ### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
### dump markdown ### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3 pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
### dump content list
pipe_result.dump_content_list(md_writer, f"{name_without_suff}_content_list.json", image_dir)
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例 前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment