Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
302a6950
Commit
302a6950
authored
Dec 11, 2024
by
xu rui
Browse files
feat: remove pipe_auto_mode
parent
3062217d
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
62 deletions
+10
-62
magic_pdf/model/__init__.py
magic_pdf/model/__init__.py
+0
-25
magic_pdf/model/operators.py
magic_pdf/model/operators.py
+0
-34
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+10
-3
No files found.
magic_pdf/model/__init__.py
View file @
302a6950
...
@@ -65,31 +65,6 @@ class InferenceResultBase(ABC):
...
@@ -65,31 +65,6 @@ class InferenceResultBase(ABC):
"""
"""
pass
pass
@
abstractmethod
def
pipe_auto_mode
(
self
,
imageWriter
:
DataWriter
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
lang
=
None
,
)
->
PipeResult
:
"""Post-proc the model inference result.
step1: classify the dataset type
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (str, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pass
@
abstractmethod
@
abstractmethod
def
pipe_txt_mode
(
def
pipe_txt_mode
(
self
,
self
,
...
...
magic_pdf/model/operators.py
View file @
302a6950
...
@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
...
@@ -71,40 +71,6 @@ class InferenceResult(InferenceResultBase):
"""
"""
return
proc
(
copy
.
deepcopy
(
self
.
_infer_res
),
*
args
,
**
kwargs
)
return
proc
(
copy
.
deepcopy
(
self
.
_infer_res
),
*
args
,
**
kwargs
)
def
pipe_auto_mode
(
self
,
imageWriter
:
DataWriter
,
start_page_id
=
0
,
end_page_id
=
None
,
debug_mode
=
False
,
lang
=
None
,
)
->
PipeResult
:
"""Post-proc the model inference result.
step1: classify the dataset type
step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
Args:
imageWriter (DataWriter): the image writer handle
start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
end_page_id (int, optional): Defaults to the last page index of dataset. Let user select some pages He/She want to process
debug_mode (bool, optional): Defaults to False. will dump more log if enabled
lang (str, optional): Defaults to None.
Returns:
PipeResult: the result
"""
pdf_proc_method
=
classify
(
self
.
_dataset
.
data_bits
())
if
pdf_proc_method
==
SupportedPdfParseMethod
.
TXT
:
return
self
.
pipe_txt_mode
(
imageWriter
,
start_page_id
,
end_page_id
,
debug_mode
,
lang
)
else
:
return
self
.
pipe_ocr_mode
(
imageWriter
,
start_page_id
,
end_page_id
,
debug_mode
,
lang
)
def
pipe_txt_mode
(
def
pipe_txt_mode
(
self
,
self
,
imageWriter
:
DataWriter
,
imageWriter
:
DataWriter
,
...
...
magic_pdf/tools/common.py
View file @
302a6950
...
@@ -170,6 +170,7 @@ def do_parse(
...
@@ -170,6 +170,7 @@ def do_parse(
logger
.
error
(
'need model list input'
)
logger
.
error
(
'need model list input'
)
exit
(
2
)
exit
(
2
)
else
:
else
:
infer_result
=
InferenceResult
(
model_list
,
ds
)
infer_result
=
InferenceResult
(
model_list
,
ds
)
if
parse_method
==
'ocr'
:
if
parse_method
==
'ocr'
:
pipe_result
=
infer_result
.
pipe_ocr_mode
(
pipe_result
=
infer_result
.
pipe_ocr_mode
(
...
@@ -180,10 +181,16 @@ def do_parse(
...
@@ -180,10 +181,16 @@ def do_parse(
image_writer
,
debug_mode
=
True
,
lang
=
lang
image_writer
,
debug_mode
=
True
,
lang
=
lang
)
)
else
:
else
:
pipe_result
=
infer_result
.
pipe_auto_mode
(
if
ds
.
classify
()
==
SupportedPdfParseMethod
.
TXT
:
pipe_result
=
infer_result
.
pipe_txt_mode
(
image_writer
,
debug_mode
=
True
,
lang
=
lang
)
else
:
pipe_result
=
infer_result
.
pipe_txt_mode
(
image_writer
,
debug_mode
=
True
,
lang
=
lang
image_writer
,
debug_mode
=
True
,
lang
=
lang
)
)
if
f_draw_model_bbox
:
if
f_draw_model_bbox
:
infer_result
.
draw_model
(
infer_result
.
draw_model
(
os
.
path
.
join
(
local_md_dir
,
f
'
{
pdf_file_name
}
_model.pdf'
)
os
.
path
.
join
(
local_md_dir
,
f
'
{
pdf_file_name
}
_model.pdf'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment