Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
f6bd47de
"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "6de7021e6be4bff0aa9c3078db999a82d96b1e3a"
Commit
f6bd47de
authored
Dec 02, 2024
by
xu rui
Browse files
docs: add dataset method description
parent
d44e7a28
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
198 additions
and
91 deletions
+198
-91
magic_pdf/data/dataset.py
magic_pdf/data/dataset.py
+79
-7
magic_pdf/libs/draw_bbox.py
magic_pdf/libs/draw_bbox.py
+0
-1
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+0
-3
magic_pdf/pipe/operators.py
magic_pdf/pipe/operators.py
+14
-0
next_docs/en/user_guide/quick_start/to_markdown.rst
next_docs/en/user_guide/quick_start/to_markdown.rst
+52
-38
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
+53
-42
No files found.
magic_pdf/data/dataset.py
View file @
f6bd47de
...
@@ -36,7 +36,7 @@ class PageableData(ABC):
...
@@ -36,7 +36,7 @@ class PageableData(ABC):
Args:
Args:
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
color (list[float] | None): three element tuple which descri
pt
the RGB of the board line, None means no board line
color (list[float] | None): three element tuple which descri
be
the RGB of the board line, None means no board line
fill (list[float] | None): fill the board with RGB, None means will not fill with color
fill (list[float] | None): fill the board with RGB, None means will not fill with color
fill_opacity (float): opacity of the fill, range from [0, 1]
fill_opacity (float): opacity of the fill, range from [0, 1]
width (float): the width of board
width (float): the width of board
...
@@ -52,7 +52,7 @@ class PageableData(ABC):
...
@@ -52,7 +52,7 @@ class PageableData(ABC):
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
content (str): the text content
content (str): the text content
fontsize (int): font size of the text
fontsize (int): font size of the text
color (list[float] | None): three element tuple which descri
pt
the RGB of the board line, None will use the default font color!
color (list[float] | None): three element tuple which descri
be
the RGB of the board line, None will use the default font color!
"""
"""
pass
pass
...
@@ -96,14 +96,39 @@ class Dataset(ABC):
...
@@ -96,14 +96,39 @@ class Dataset(ABC):
@
abstractmethod
@
abstractmethod
def
dump_to_file
(
self
,
file_path
:
str
):
def
dump_to_file
(
self
,
file_path
:
str
):
"""Dump the file
Args:
file_path (str): the file path
"""
pass
pass
@
abstractmethod
@
abstractmethod
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply a callable to this dataset.
Args:
proc (Callable): invoke proc as follows:
proc(dataset, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
pass
pass
@
abstractmethod
@
abstractmethod
def
classify
(
self
)
->
SupportedPdfParseMethod
:
def
classify
(
self
)
->
SupportedPdfParseMethod
:
"""classify the dataset
Returns:
SupportedPdfParseMethod: the parse method selected for this dataset
"""
pass
@
abstractmethod
def
clone
(
self
):
"""clone this dataset
"""
pass
pass
...
@@ -151,18 +176,42 @@ class PymuDocDataset(Dataset):
...
@@ -151,18 +176,42 @@ class PymuDocDataset(Dataset):
return
self
.
_records
[
page_id
]
return
self
.
_records
[
page_id
]
def
dump_to_file
(
self
,
file_path
:
str
):
def
dump_to_file
(
self
,
file_path
:
str
):
"""Dump the file
Args:
file_path (str): the file path
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
dir_name
=
os
.
path
.
dirname
(
file_path
)
if
dir_name
not
in
(
''
,
'.'
,
'..'
):
if
dir_name
not
in
(
''
,
'.'
,
'..'
):
os
.
makedirs
(
dir_name
,
exist_ok
=
True
)
os
.
makedirs
(
dir_name
,
exist_ok
=
True
)
self
.
_raw_fitz
.
save
(
file_path
)
self
.
_raw_fitz
.
save
(
file_path
)
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
new_args
=
tuple
([
self
]
+
list
(
args
))
"""Apply a callable to this dataset.
return
proc
(
*
new_args
,
**
kwargs
)
Args:
proc (Callable): invoke proc as follows:
proc(dataset, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return
proc
(
self
,
*
args
,
**
kwargs
)
def
classify
(
self
)
->
SupportedPdfParseMethod
:
def
classify
(
self
)
->
SupportedPdfParseMethod
:
"""classify the dataset
Returns:
SupportedPdfParseMethod: the parse method detected for this dataset
"""
return
classify
(
self
.
_data_bits
)
return
classify
(
self
.
_data_bits
)
def
clone
(
self
):
"""clone this dataset
"""
return
PymuDocDataset
(
self
.
_raw_data
)
class
ImageDataset
(
Dataset
):
class
ImageDataset
(
Dataset
):
def
__init__
(
self
,
bits
:
bytes
):
def
__init__
(
self
,
bits
:
bytes
):
...
@@ -209,17 +258,40 @@ class ImageDataset(Dataset):
...
@@ -209,17 +258,40 @@ class ImageDataset(Dataset):
return
self
.
_records
[
page_id
]
return
self
.
_records
[
page_id
]
def
dump_to_file
(
self
,
file_path
:
str
):
def
dump_to_file
(
self
,
file_path
:
str
):
"""Dump the file
Args:
file_path (str): the file path
"""
dir_name
=
os
.
path
.
dirname
(
file_path
)
dir_name
=
os
.
path
.
dirname
(
file_path
)
if
dir_name
not
in
(
''
,
'.'
,
'..'
):
if
dir_name
not
in
(
''
,
'.'
,
'..'
):
os
.
makedirs
(
dir_name
,
exist_ok
=
True
)
os
.
makedirs
(
dir_name
,
exist_ok
=
True
)
self
.
_raw_fitz
.
save
(
file_path
)
self
.
_raw_fitz
.
save
(
file_path
)
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply a callable to this dataset.
Args:
proc (Callable): invoke proc as follows:
proc(dataset, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return
proc
(
self
,
*
args
,
**
kwargs
)
return
proc
(
self
,
*
args
,
**
kwargs
)
def
classify
(
self
)
->
SupportedPdfParseMethod
:
def
classify
(
self
)
->
SupportedPdfParseMethod
:
"""classify the dataset
Returns:
SupportedPdfParseMethod: always OCR for an image dataset
"""
return
SupportedPdfParseMethod
.
OCR
return
SupportedPdfParseMethod
.
OCR
def
clone
(
self
):
"""clone this dataset
"""
return
ImageDataset
(
self
.
_raw_data
)
class
Doc
(
PageableData
):
class
Doc
(
PageableData
):
"""Initialized with pymudoc object."""
"""Initialized with pymudoc object."""
...
@@ -228,7 +300,7 @@ class Doc(PageableData):
...
@@ -228,7 +300,7 @@ class Doc(PageableData):
self
.
_doc
=
doc
self
.
_doc
=
doc
def
get_image
(
self
):
def
get_image
(
self
):
"""Return the imge info.
"""Return the im
a
ge info.
Returns:
Returns:
dict: {
dict: {
...
@@ -266,7 +338,7 @@ class Doc(PageableData):
...
@@ -266,7 +338,7 @@ class Doc(PageableData):
Args:
Args:
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
color (list[float] | None): three element tuple which descri
pt
the RGB of the board line, None means no board line
color (list[float] | None): three element tuple which descri
be
the RGB of the board line, None means no board line
fill (list[float] | None): fill the board with RGB, None means will not fill with color
fill (list[float] | None): fill the board with RGB, None means will not fill with color
fill_opacity (float): opacity of the fill, range from [0, 1]
fill_opacity (float): opacity of the fill, range from [0, 1]
width (float): the width of board
width (float): the width of board
...
@@ -288,6 +360,6 @@ class Doc(PageableData):
...
@@ -288,6 +360,6 @@ class Doc(PageableData):
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
content (str): the text content
content (str): the text content
fontsize (int): font size of the text
fontsize (int): font size of the text
color (list[float] | None): three element tuple which descri
pt
the RGB of the board line, None will use the default font color!
color (list[float] | None): three element tuple which descri
be
the RGB of the board line, None will use the default font color!
"""
"""
self
.
_doc
.
insert_text
(
coord
,
content
,
fontsize
=
fontsize
,
color
=
color
)
self
.
_doc
.
insert_text
(
coord
,
content
,
fontsize
=
fontsize
,
color
=
color
)
magic_pdf/libs/draw_bbox.py
View file @
f6bd47de
...
@@ -3,7 +3,6 @@ from magic_pdf.config.constants import CROSS_PAGE
...
@@ -3,7 +3,6 @@ from magic_pdf.config.constants import CROSS_PAGE
from
magic_pdf.config.ocr_content_type
import
(
BlockType
,
CategoryId
,
from
magic_pdf.config.ocr_content_type
import
(
BlockType
,
CategoryId
,
ContentType
)
ContentType
)
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.libs.commons
import
fitz
# PyMuPDF
from
magic_pdf.model.magic_model
import
MagicModel
from
magic_pdf.model.magic_model
import
MagicModel
...
...
magic_pdf/pdf_parse_union_core_v2.py
View file @
f6bd47de
...
@@ -89,10 +89,7 @@ def chars_to_content(span):
...
@@ -89,10 +89,7 @@ def chars_to_content(span):
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
LINE_STOP_FLAG
=
(
'.'
,
'!'
,
'?'
,
'。'
,
'!'
,
'?'
,
')'
,
')'
,
'"'
,
'”'
,
':'
,
':'
,
';'
,
';'
,
']'
,
'】'
,
'}'
,
'}'
,
'>'
,
'》'
,
'、'
,
','
,
','
,
'-'
,
'—'
,
'–'
,)
<<<<<<<
HEAD
LINE_START_FLAG
=
(
'('
,
'('
,
'"'
,
'“'
,
'【'
,
'{'
,
'《'
,
'<'
,
'「'
,
'『'
,
'【'
,
'['
,)
LINE_START_FLAG
=
(
'('
,
'('
,
'"'
,
'“'
,
'【'
,
'{'
,
'《'
,
'<'
,
'「'
,
'『'
,
'【'
,
'['
,)
=======
>>>>>>>
731
f4bf
(
feat
:
add
function
definitions
)
def
fill_char_in_spans
(
spans
,
all_chars
):
def
fill_char_in_spans
(
spans
,
all_chars
):
...
...
magic_pdf/pipe/operators.py
View file @
f6bd47de
import
json
import
json
import
os
import
os
from
typing
import
Callable
import
copy
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.data_reader_writer
import
DataWriter
...
@@ -122,3 +124,15 @@ class PipeResult:
...
@@ -122,3 +124,15 @@ class PipeResult:
str: compress the pipeline result and return
str: compress the pipeline result and return
"""
"""
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
return
JsonCompressor
.
compress_json
(
self
.
pdf_mid_data
)
def
apply
(
self
,
proc
:
Callable
,
*
args
,
**
kwargs
):
"""Apply a callable to a deep copy of the pipeline result.
Args:
proc (Callable): invoke proc as follows:
proc(pipeline_result, *args, **kwargs)
Returns:
Any: return the result generated by proc
"""
return
proc
(
copy
.
deepcopy
(
self
.
_pipe_res
),
*
args
,
**
kwargs
)
next_docs/en/user_guide/quick_start/to_markdown.rst
View file @
f6bd47de
...
@@ -12,17 +12,17 @@ Local File Example
...
@@ -12,17 +12,17 @@ Local File Example
import os
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.
config.make_content_config import DropMode, MakeMode
from magic_pdf.
data.dataset import PymuDocDataset
from magic_pdf.
pipe.OCRPipe import OCRPip
e
from magic_pdf.
model.doc_analyze_by_custom_model import doc_analyz
e
# args
## args
model_list = []
pdf_file_name = "abc.pdf" # replace with the real pdf path
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
## prepare env
local_image_dir, local_md_dir = "output/images", "output"
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
...
@@ -30,27 +30,31 @@ Local File Example
...
@@ -30,27 +30,31 @@ Local File Example
)
)
image_dir = str(os.path.basename(local_image_dir))
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)
# read the pdf content
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
### draw model result on each page
pipe.pipe_analyze()
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
md_content = pipe.pipe_mk_markdown(
### draw spans result on each page
image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
)
if isinstance(md_content, list):
### dump markdown
md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
else:
md_writer.write_string(f"{pdf_file_name}.md", md_content)
S3 File Example
S3 File Example
...
@@ -61,8 +65,8 @@ S3 File Example
...
@@ -61,8 +65,8 @@ S3 File Example
import os
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.
config.make_content_config import DropMode, MakeMode
from magic_pdf.
data.dataset import PymuDocDataset
from magic_pdf.
pipe.OCRPipe import OCRPip
e
from magic_pdf.
model.doc_analyze_by_custom_model import doc_analyz
e
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
ak = "{Your S3 access key}" # replace with real s3 access key
...
@@ -74,29 +78,39 @@ S3 File Example
...
@@ -74,29 +78,39 @@ S3 File Example
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
# args
model_list = []
pdf_file_name = (
pdf_file_name = f"s3://{bucket_name}/{{fake pdf path}}" # replace with the real s3 path
"s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
### draw model result on each page
pipe.pipe_analyze()
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
md_content = pipe.pipe_mk_markdown(
### draw layout result on each page
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
)
### draw spans result on each page
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
if isinstance(md_content, list):
### dump markdown
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
Check :doc:`../data/data_reader_writer` for more [reader | writer] examples
and check :doc:`../../api/pipe_operators` or :doc:`../../api/model_operators` for api details
next_docs/zh_cn/user_guide/quick_start/to_markdown.rst
View file @
f6bd47de
转换为 Markdown 文件
转换为 Markdown 文件
========================
========================
本地文件示例
本地文件示例
^^^^^^^^^^^
^^^^^^^^^^^
^^^^^^^
.. code:: python
.. code:: python
import os
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
## args
# args
model_list = []
pdf_file_name = "abc.pdf" # replace with the real pdf path
pdf_file_name = "abc.pdf" # replace with the real pdf path
name_without_suff = pdf_file_name.split(".")[0]
# prepare env
## prepare env
local_image_dir, local_md_dir = "output/images", "output"
local_image_dir, local_md_dir = "output/images", "output"
image_dir = str(os.path.basename(local_image_dir))
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
...
@@ -30,39 +28,43 @@
...
@@ -30,39 +28,43 @@
)
)
image_dir = str(os.path.basename(local_image_dir))
image_dir = str(os.path.basename(local_image_dir))
# read bytes
reader1 = FileBasedDataReader("")
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)
# read the pdf content
pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
### draw model result on each page
pipe.pipe_analyze()
infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suff}_model.pdf"))
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
### draw layout result on each page
pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf"))
md_content = pipe.pipe_mk_markdown(
### draw spans result on each page
image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf"))
)
if isinstance(md_content, list):
### dump markdown
md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
pipe_result.dump_md(md_writer, f"{name_without_suff}.md", image_dir)
else:
md_writer.write_string(f"{pdf_file_name}.md", md_content)
对象存储
使用
示例
对象存储
文件
示例
^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^
^
.. code:: python
.. code:: python
import os
import os
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.data.data_reader_writer import S3DataReader, S3DataWriter
from magic_pdf.
config.make_content_config import DropMode, MakeMode
from magic_pdf.
data.dataset import PymuDocDataset
from magic_pdf.
pipe.OCRPipe import OCRPip
e
from magic_pdf.
model.doc_analyze_by_custom_model import doc_analyz
e
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
bucket_name = "{Your S3 Bucket Name}" # replace with real bucket name
ak = "{Your S3 access key}" # replace with real s3 access key
ak = "{Your S3 access key}" # replace with real s3 access key
...
@@ -74,30 +76,39 @@
...
@@ -74,30 +76,39 @@
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
writer = S3DataWriter('unittest/tmp', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
image_writer = S3DataWriter('unittest/tmp/images', bucket_name, ak, sk, endpoint_url)
## args
# args
model_list = []
pdf_file_name = (
pdf_file_name = f"s3://{bucket_name}/{{fake pdf path}}" # replace with the real s3 path
"s3://llm-pdf-text-1/unittest/tmp/bug5-11.pdf" # replace with the real s3 path
)
# prepare env
local_dir = "output"
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
# read bytes
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
pdf_bytes = reader.read(pdf_file_name) # read the pdf content
# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
## inference
infer_result = ds.apply(doc_analyze, ocr=True)
pipe.pipe_classify()
### draw model result on each page
pipe.pipe_analyze()
infer_result.draw_model(os.path.join(local_dir, f'{name_without_suff}_model.pdf')) # dump to local
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
## pipeline
pipe_result = infer_result.pipe_ocr_mode(image_writer)
md_content = pipe.pipe_mk_markdown(
### draw layout result on each page
"unittest/tmp/images", drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD
pipe_result.draw_layout(os.path.join(local_dir, f'{name_without_suff}_layout.pdf')) # dump to local
)
if isinstance(md_content, list):
### draw spans result on each page
writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content))
pipe_result.draw_span(os.path.join(local_dir, f'{name_without_suff}_spans.pdf')) # dump to local
else:
writer.write_string(f"{pdf_file_name}.md", md_content)
### dump markdown
pipe_result.dump_md(writer, f'{name_without_suff}.md', "unittest/tmp/images") # dump to remote s3
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
前去 :doc:`../data/data_reader_writer` 获取更多有关 **读写** 示例
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment