Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
87af738a
Commit
87af738a
authored
Dec 07, 2024
by
sawmice
Browse files
fix: 1. error in ocr and txt modes 2. missing pdf_parse_type field
parent
fa113b57
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
20 additions
and
6 deletions
+20
-6
magic_pdf/config/constants.py
magic_pdf/config/constants.py
+5
-0
magic_pdf/model/operators.py
magic_pdf/model/operators.py
+14
-3
magic_pdf/user_api.py
magic_pdf/user_api.py
+1
-3
No files found.
magic_pdf/config/constants.py
View file @
87af738a
...
...
@@ -51,3 +51,8 @@ class MODEL_NAME:
UniMerNet_v2_Small
=
'unimernet_small'
RAPID_TABLE
=
'rapid_table'
PARSE_TYPE_TXT
=
'txt'
PARSE_TYPE_OCR
=
'ocr'
magic_pdf/model/operators.py
View file @
87af738a
...
...
@@ -6,12 +6,14 @@ from typing import Callable
from
magic_pdf.config.enums
import
SupportedPdfParseMethod
from
magic_pdf.data.data_reader_writer
import
DataWriter
from
magic_pdf.data.dataset
import
Dataset
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.filter
import
classify
from
magic_pdf.libs.draw_bbox
import
draw_model_bbox
from
magic_pdf.pdf_parse_union_core_v2
import
pdf_parse_union
from
magic_pdf.pipe.operators
import
PipeResult
from
magic_pdf.model
import
InferenceResultBase
from
magic_pdf.libs.version
import
__version__
from
magic_pdf.config.constants
import
PARSE_TYPE_TXT
,
PARSE_TYPE_OCR
class
InferenceResult
(
InferenceResultBase
):
def
__init__
(
self
,
inference_results
:
list
,
dataset
:
Dataset
):
...
...
@@ -129,7 +131,7 @@ class InferenceResult(InferenceResultBase):
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
return
PipeResult
(
res
,
self
.
_dataset
)
return
self
.
apply
(
res
=
self
.
apply
(
proc
,
self
.
_dataset
,
imageWriter
,
...
...
@@ -139,6 +141,11 @@ class InferenceResult(InferenceResultBase):
debug_mode
=
debug_mode
,
lang
=
lang
,
)
res
[
'_parse_type'
]
=
PARSE_TYPE_TXT
res
[
'_version_name'
]
=
__version__
return
res
def
pipe_ocr_mode
(
self
,
...
...
@@ -166,7 +173,7 @@ class InferenceResult(InferenceResultBase):
res
=
pdf_parse_union
(
*
args
,
**
kwargs
)
return
PipeResult
(
res
,
self
.
_dataset
)
return
self
.
apply
(
res
=
self
.
apply
(
proc
,
self
.
_dataset
,
imageWriter
,
...
...
@@ -176,3 +183,7 @@ class InferenceResult(InferenceResultBase):
debug_mode
=
debug_mode
,
lang
=
lang
,
)
res
[
'_parse_type'
]
=
PARSE_TYPE_OCR
res
[
'_version_name'
]
=
__version__
return
res
\ No newline at end of file
magic_pdf/user_api.py
View file @
87af738a
...
...
@@ -15,9 +15,7 @@ from magic_pdf.libs.version import __version__
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.pdf_parse_by_ocr
import
parse_pdf_by_ocr
from
magic_pdf.pdf_parse_by_txt
import
parse_pdf_by_txt
PARSE_TYPE_TXT
=
'txt'
PARSE_TYPE_OCR
=
'ocr'
from
magic_pdf.config.constants
import
PARSE_TYPE_TXT
,
PARSE_TYPE_OCR
def
parse_txt_pdf
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment