Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d44e7a28
Commit
d44e7a28
authored
Nov 29, 2024
by
xu rui
Browse files
refactor: add docs
parent
4a82d6a0
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
153 additions
and
9 deletions
+153
-9
magic_pdf/model/__init__.py
magic_pdf/model/__init__.py
+124
-0
magic_pdf/model/operators.py
magic_pdf/model/operators.py
+9
-8
next_docs/en/api.rst
next_docs/en/api.rst
+2
-0
next_docs/en/api/model_operators.rst
next_docs/en/api/model_operators.rst
+8
-0
next_docs/en/api/pipe_operators.rst
next_docs/en/api/pipe_operators.rst
+9
-0
next_docs/en/conf.py
next_docs/en/conf.py
+1
-1
No files found.
magic_pdf/model/__init__.py
View file @
d44e7a28
from
typing
import
Callable
from
abc
import
ABC
,
abstractmethod
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.pipe.operators
import
PipeResult
# Module-level flag, set to True at import time. Nothing in the visible code
# reads it. NOTE(review): presumably toggles use of the built-in models --
# confirm against the callers before relying on this.
__use_inside_model__
=
True
# Module-level mode string, initialised to "full". Nothing in the visible code
# reads it. NOTE(review): the set of accepted values is not shown here -- confirm.
__model_mode__
=
"full"
class InferenceResultBase(ABC):
    """Abstract interface for a model inference result bound to a dataset.

    Every public method, including ``__init__``, is abstract, so a concrete
    subclass must override all of them before it can be instantiated.
    """

    @abstractmethod
    def __init__(self, inference_results: list, dataset: Dataset):
        """Store the inference results and the dataset they belong to.

        Although abstract, this method has a body: an overriding ``__init__``
        may call ``super().__init__(inference_results, dataset)`` to set
        ``_infer_res`` and ``_dataset``.

        Args:
            inference_results (list): the inference result generated by model
            dataset (Dataset): the dataset related with model inference result
        """
        self._infer_res = inference_results
        self._dataset = dataset

    @abstractmethod
    def draw_model(self, file_path: str) -> None:
        """Draw the model inference result.

        Args:
            file_path (str): the output file path
        """

    @abstractmethod
    def dump_model(self, writer: DataWriter, file_path: str) -> None:
        """Dump the model inference result to a file.

        Args:
            writer (DataWriter): writer handle
            file_path (str): the location of the target file
        """

    @abstractmethod
    def get_infer_res(self) -> list:
        """Get the inference result.

        Returns:
            list: the inference result generated by model
        """

    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply a callable to the inference result.

        Args:
            proc (Callable): invoked as follows:
                proc(inference_result, *args, **kwargs)
            *args: extra positional arguments forwarded to ``proc``
            **kwargs: extra keyword arguments forwarded to ``proc``

        Returns:
            Any: the result generated by ``proc``
        """

    @abstractmethod
    def pipe_auto_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result, choosing the mode automatically.

        step1: classify the dataset type
        step2: based on the result of step1, use `pipe_txt_mode` or `pipe_ocr_mode`

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. First page of the range the user wants to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Last page of the range the user wants to process
            debug_mode (bool, optional): Defaults to False. Dumps more log output if enabled
            lang (str, optional): Defaults to None. Interpretation is left to the concrete implementation

        Returns:
            PipeResult: the result
        """

    @abstractmethod
    def pipe_txt_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result in text mode.

        The text is extracted using a third-party library, such as `pymupdf`.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. First page of the range the user wants to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Last page of the range the user wants to process
            debug_mode (bool, optional): Defaults to False. Dumps more log output if enabled
            lang (str, optional): Defaults to None. Interpretation is left to the concrete implementation

        Returns:
            PipeResult: the result
        """

    @abstractmethod
    def pipe_ocr_mode(
        self,
        imageWriter: DataWriter,
        start_page_id=0,
        end_page_id=None,
        debug_mode=False,
        lang=None,
    ) -> PipeResult:
        """Post-process the model inference result in OCR mode.

        NOTE(review): presumably the text is obtained through OCR rather than
        read from the document directly -- confirm against the concrete
        implementation.

        Args:
            imageWriter (DataWriter): the image writer handle
            start_page_id (int, optional): Defaults to 0. First page of the range the user wants to process
            end_page_id (int, optional): Defaults to the last page index of dataset. Last page of the range the user wants to process
            debug_mode (bool, optional): Defaults to False. Dumps more log output if enabled
            lang (str, optional): Defaults to None. Interpretation is left to the concrete implementation

        Returns:
            PipeResult: the result
        """
magic_pdf/model/operators.py
View file @
d44e7a28
...
...
@@ -10,9 +10,10 @@ from magic_pdf.filter import classify
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pipe.operators
import
PipeResult
from
magic_pdf.model
import
InferenceResultBase
class
InferenceResult
:
class
InferenceResult
(
InferenceResultBase
)
:
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
"""Initialized method.
...
...
@@ -52,7 +53,7 @@ class InferenceResult:
"""Get the inference result.
Returns:
list
[dict]
: the inference result generated by model
list: the inference result generated by model
"""
return
self
.
_infer_res
...
...
@@ -83,9 +84,9 @@ class InferenceResult:
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (
_type_
, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
end_page_id (
int
, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (
_type_
, optional): Defaults to None.
lang (
str
, optional): Defaults to None.
Returns:
PipeResult: the result
...
...
@@ -116,9 +117,9 @@ class InferenceResult:
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (
_type_
, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
end_page_id (
int
, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (
_type_
, optional): Defaults to None.
lang (
str
, optional): Defaults to None.
Returns:
PipeResult: the result
...
...
@@ -153,9 +154,9 @@ class InferenceResult:
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (
_type_
, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
end_page_id (
int
, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (
_type_
, optional): Defaults to None.
lang (
str
, optional): Defaults to None.
Returns:
PipeResult: the result
...
...
next_docs/en/api.rst
View file @
d44e7a28
...
...
@@ -7,3 +7,5 @@
api/read_api
api/schemas
api/io
api/pipe_operators
api/model_operators
\ No newline at end of file
next_docs/en/api/model_operators.rst
0 → 100644
View file @
d44e7a28
Model Api
==========
.. autoclass:: magic_pdf.model.InferenceResultBase
:members:
:inherited-members:
:show-inheritance:
next_docs/en/api/pipe_operators.rst
0 → 100644
View file @
d44e7a28
Pipeline Api
=============
.. autoclass:: magic_pdf.pipe.operators.PipeResult
:members:
:inherited-members:
:show-inheritance:
\ No newline at end of file
next_docs/en/conf.py
View file @
d44e7a28
...
...
@@ -114,7 +114,7 @@ autodoc_mock_imports = [
'sentencepiece'
,
'vllm.cuda_utils'
,
'vllm._C'
,
'numpy'
,
#
'numpy',
'tqdm'
,
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment