Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
c5a4150e
Unverified
Commit
c5a4150e
authored
Dec 09, 2024
by
Xiaomeng Zhao
Committed by
GitHub
Dec 09, 2024
Browse files
Merge pull request #1228 from icecraft/fix/pipe_result
fix: add parse_pdf_type and version
parents
8f266869
57f9f9dc
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
13 deletions
+14
-13
magic_pdf/model/operators.py
magic_pdf/model/operators.py
+14
-13
No files found.
magic_pdf/model/operators.py
View file @
c5a4150e
...
@@ -3,17 +3,17 @@ import json
...
@@ -3,17 +3,17 @@ import json
import
os
import
os
from
typing
import
Callable
from
typing
import
Callable
from
magic_pdf.config.constants
import
PARSE_TYPE_OCR
,
PARSE_TYPE_TXT
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.filter
import
classify
from
magic_pdf.filter
import
classify
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.model
import
InferenceResultBase
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pipe.operators
import
PipeResult
from
magic_pdf.pipe.operators
import
PipeResult
from
magic_pdf.model
import
InferenceResultBase
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.config.constants
import
PARSE_TYPE_TXT
,
PARSE_TYPE_OCR
class
InferenceResult
(
InferenceResultBase
):
class
InferenceResult
(
InferenceResultBase
):
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
...
@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
...
@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
[
'_parse_type'
]
=
PARSE_TYPE_TXT
res
[
'_version_name'
]
=
__version__
if
'lang'
in
kwargs
and
kwargs
[
'lang'
]
is
not
None
:
res
[
'lang'
]
=
kwargs
[
'lang'
]
return
PipeResult
(
res
,
self
.
_dataset
)
return
PipeResult
(
res
,
self
.
_dataset
)
res
=
self
.
apply
(
res
=
self
.
apply
(
...
@@ -141,12 +145,8 @@ class InferenceResult(InferenceResultBase):
...
@@ -141,12 +145,8 @@ class InferenceResult(InferenceResultBase):
debug_mode
=
debug_mode
,
debug_mode
=
debug_mode
,
lang
=
lang
,
lang
=
lang
,
)
)
res
[
'_parse_type'
]
=
PARSE_TYPE_TXT
res
[
'_version_name'
]
=
__version__
return
res
return
res
def
pipe_ocr_mode
(
def
pipe_ocr_mode
(
self
,
self
,
imageWriter
:
DataWriter
,
imageWriter
:
DataWriter
,
...
@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
...
@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
[
'_parse_type'
]
=
PARSE_TYPE_OCR
res
[
'_version_name'
]
=
__version__
if
'lang'
in
kwargs
and
kwargs
[
'lang'
]
is
not
None
:
res
[
'lang'
]
=
kwargs
[
'lang'
]
return
PipeResult
(
res
,
self
.
_dataset
)
return
PipeResult
(
res
,
self
.
_dataset
)
res
=
self
.
apply
(
res
=
self
.
apply
(
proc
,
proc
,
self
.
_dataset
,
self
.
_dataset
,
imageWriter
,
imageWriter
,
SupportedPdfParseMethod
.
TXT
,
SupportedPdfParseMethod
.
OCR
,
start_page_id
=
start_page_id
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
end_page_id
=
end_page_id
,
debug_mode
=
debug_mode
,
debug_mode
=
debug_mode
,
lang
=
lang
,
lang
=
lang
,
)
)
res
[
'_parse_type'
]
=
PARSE_TYPE_OCR
res
[
'_version_name'
]
=
__version__
return
res
return
res
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment