Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
b492c19c
Commit
b492c19c
authored
Nov 19, 2024
by
icecraft
Browse files
refactor: move some constants or enums defs to config folder
parent
bc992433
Changes
42
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
16 additions
and
18 deletions
+16
-18
magic_pdf/spark/spark_api.py
magic_pdf/spark/spark_api.py
+15
-17
magic_pdf/tools/common.py
magic_pdf/tools/common.py
+1
-1
No files found.
magic_pdf/spark/spark_api.py
View file @
b492c19c
from
loguru
import
logger
from
magic_pdf.
libs
.drop_reason
import
DropReason
from
magic_pdf.
config
.drop_reason
import
DropReason
def get_data_source(jso: dict):
    """Return the data source recorded in ``jso``.

    The ``'data_source'`` key is preferred. When it is missing or mapped to
    ``None``, the value under ``'file_source'`` is returned instead (which is
    itself ``None`` when that key is also absent).

    Args:
        jso: Record to read from.

    Returns:
        The value stored under ``'data_source'`` or, as a fallback,
        ``'file_source'``.
    """
    data_source = jso.get('data_source')
    # Only a missing/None value triggers the fallback; other falsy values
    # (e.g. an empty string) are returned as-is.
    if data_source is None:
        data_source = jso.get('file_source')
    return data_source
def get_data_type(jso: dict):
    """Return the data type recorded in ``jso``.

    The ``'data_type'`` key is preferred. When it is missing or mapped to
    ``None``, the value under ``'file_type'`` is returned instead (which is
    itself ``None`` when that key is also absent).

    Args:
        jso: Record to read from.

    Returns:
        The value stored under ``'data_type'`` or, as a fallback,
        ``'file_type'``.
    """
    data_type = jso.get('data_type')
    # Fall back only on a missing/None value, not on other falsy values.
    if data_type is None:
        data_type = jso.get('file_type')
    return data_type
def get_bookid(jso: dict):
    """Return the book identifier recorded in ``jso``.

    The ``'bookid'`` key is preferred. When it is missing or mapped to
    ``None``, the value under ``'original_file_id'`` is returned instead
    (which is itself ``None`` when that key is also absent).

    Args:
        jso: Record to read from.

    Returns:
        The value stored under ``'bookid'`` or, as a fallback,
        ``'original_file_id'``.
    """
    book_id = jso.get('bookid')
    # Fall back only on a missing/None value, not on other falsy values.
    if book_id is None:
        book_id = jso.get('original_file_id')
    return book_id
def exception_handler(jso: dict, e):
    """Log ``e`` and flag ``jso`` as a record to be dropped.

    The record is modified in place:

    * ``'_need_drop'`` is set to ``True``;
    * ``'_drop_reason'`` is set to ``DropReason.Exception``;
    * ``'_exception'`` is set to the text ``ERROR: `` followed by ``e``
      formatted via an f-string.

    Args:
        jso: Record to annotate; mutated in place.
        e: The exception (or other object) being reported.

    Returns:
        The same ``jso`` object that was passed in.
    """
    logger.exception(e)
    jso['_need_drop'] = True
    jso['_drop_reason'] = DropReason.Exception
    jso['_exception'] = f'ERROR: {e}'
    return jso
def get_bookname(jso: dict):
    """Build the book name for ``jso`` as ``<data_source>/<file_id>``.

    The data source comes from ``get_data_source(jso)`` and the file id from
    the ``'file_id'`` key. Either part is rendered as the string ``None`` when
    its value is ``None`` (f-string formatting), so the result is always a
    ``str``.

    Args:
        jso: Record to read from.

    Returns:
        The data source and file id joined by a ``/``.
    """
    data_source = get_data_source(jso)
    file_id = jso.get('file_id')
    book_name = f'{data_source}/{file_id}'
    return book_name
def spark_json_extractor(jso: dict) -> dict:
    """Extract data from the JSON record and return it as a dict.

    Args:
        jso: Record that must contain the keys ``'_pdf_type'`` and
            ``'doc_layout_result'``.

    Returns:
        A new dict with ``'_pdf_type'`` copied from ``jso['_pdf_type']`` and
        ``'model_list'`` taken from ``jso['doc_layout_result']``.

    Raises:
        KeyError: If ``jso`` is a plain dict lacking either required key
            (plain subscript access is used, not ``.get``).
    """
    return {
        '_pdf_type': jso['_pdf_type'],
        # The layout result is exposed to callers under the name 'model_list'.
        'model_list': jso['doc_layout_result'],
    }
magic_pdf/tools/common.py
View file @
b492c19c
...
...
@@ -7,10 +7,10 @@ import fitz
from
loguru
import
logger
import
magic_pdf.model
as
model_config
from
magic_pdf.config.make_content_config
import
DropMode
,
MakeMode
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.libs.draw_bbox
import
(
draw_layout_bbox
,
draw_line_sort_bbox
,
draw_model_bbox
,
draw_span_bbox
)
from
magic_pdf.libs.MakeContentConfig
import
DropMode
,
MakeMode
from
magic_pdf.pipe.OCRPipe
import
OCRPipe
from
magic_pdf.pipe.TXTPipe
import
TXTPipe
from
magic_pdf.pipe.UNIPipe
import
UNIPipe
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment