Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
57f9f9dc
Commit
57f9f9dc
authored
Dec 09, 2024
by
icecraft
Browse files
fix: add parse_pdf_type and version
parent
8f266869
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
13 deletions
+14
-13
magic_pdf/model/operators.py
magic_pdf/model/operators.py
+14
-13
No files found.
magic_pdf/model/operators.py
View file @
57f9f9dc
...
@@ -3,17 +3,17 @@ import json
...
@@ -3,17 +3,17 @@ import json
import
os
import
os
from
typing
import
Callable
from
typing
import
Callable
from
magic_pdf.config.constants
import
PARSE_TYPE_OCR
,
PARSE_TYPE_TXT
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.filter
import
classify
from
magic_pdf.filter
import
classify
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.model
import
InferenceResultBase
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pipe.operators
import
PipeResult
from
magic_pdf.pipe.operators
import
PipeResult
from
magic_pdf.model
import
InferenceResultBase
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.config.constants
import
PARSE_TYPE_TXT
,
PARSE_TYPE_OCR
class
InferenceResult
(
InferenceResultBase
):
class
InferenceResult
(
InferenceResultBase
):
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
...
@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
...
@@ -129,6 +129,10 @@ class InferenceResult(InferenceResultBase):
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
[
'_parse_type'
]
=
PARSE_TYPE_TXT
res
[
'_version_name'
]
=
__version__
if
'lang'
in
kwargs
and
kwargs
[
'lang'
]
is
not
None
:
res
[
'lang'
]
=
kwargs
[
'lang'
]
return
PipeResult
(
res
,
self
.
_dataset
)
return
PipeResult
(
res
,
self
.
_dataset
)
res
=
self
.
apply
(
res
=
self
.
apply
(
...
@@ -141,12 +145,8 @@ class InferenceResult(InferenceResultBase):
...
@@ -141,12 +145,8 @@ class InferenceResult(InferenceResultBase):
debug_mode
=
debug_mode
,
debug_mode
=
debug_mode
,
lang
=
lang
,
lang
=
lang
,
)
)
res
[
'_parse_type'
]
=
PARSE_TYPE_TXT
res
[
'_version_name'
]
=
__version__
return
res
return
res
def
pipe_ocr_mode
(
def
pipe_ocr_mode
(
self
,
self
,
imageWriter
:
DataWriter
,
imageWriter
:
DataWriter
,
...
@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
...
@@ -171,19 +171,20 @@ class InferenceResult(InferenceResultBase):
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
def
proc
(
*
args
,
**
kwargs
)
->
PipeResult
:
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
res
[
'_parse_type'
]
=
PARSE_TYPE_OCR
res
[
'_version_name'
]
=
__version__
if
'lang'
in
kwargs
and
kwargs
[
'lang'
]
is
not
None
:
res
[
'lang'
]
=
kwargs
[
'lang'
]
return
PipeResult
(
res
,
self
.
_dataset
)
return
PipeResult
(
res
,
self
.
_dataset
)
res
=
self
.
apply
(
res
=
self
.
apply
(
proc
,
proc
,
self
.
_dataset
,
self
.
_dataset
,
imageWriter
,
imageWriter
,
SupportedPdfParseMethod
.
TXT
,
SupportedPdfParseMethod
.
OCR
,
start_page_id
=
start_page_id
,
start_page_id
=
start_page_id
,
end_page_id
=
end_page_id
,
end_page_id
=
end_page_id
,
debug_mode
=
debug_mode
,
debug_mode
=
debug_mode
,
lang
=
lang
,
lang
=
lang
,
)
)
res
[
'_parse_type'
]
=
PARSE_TYPE_OCR
res
[
'_version_name'
]
=
__version__
return
res
return
res
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment