Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
8a0aa7a4
Unverified
Commit
8a0aa7a4
authored
Jan 06, 2025
by
Xiaomeng Zhao
Committed by
GitHub
Jan 06, 2025
Browse files
Merge branch 'dev' into dev
parents
2e1bf881
ad9abc32
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
8 deletions
+11
-8
magic_pdf/data/data_reader_writer/multi_bucket_s3.py
magic_pdf/data/data_reader_writer/multi_bucket_s3.py
+8
-6
magic_pdf/pdf_parse_union_core_v2.py
magic_pdf/pdf_parse_union_core_v2.py
+2
-1
projects/web_api/app.py
projects/web_api/app.py
+1
-1
No files found.
magic_pdf/data/data_reader_writer/multi_bucket_s3.py
View file @
8a0aa7a4
import
os
from
magic_pdf.config.exceptions
import
InvalidConfig
,
InvalidParams
from
magic_pdf.config.exceptions
import
InvalidConfig
,
InvalidParams
from
magic_pdf.data.data_reader_writer.base
import
DataReader
,
DataWriter
from
magic_pdf.data.data_reader_writer.base
import
DataReader
,
DataWriter
from
magic_pdf.data.io.s3
import
S3Reader
,
S3Writer
from
magic_pdf.data.io.s3
import
S3Reader
,
S3Writer
...
@@ -22,10 +22,10 @@ class MultiS3Mixin:
...
@@ -22,10 +22,10 @@ class MultiS3Mixin:
"""
"""
if
len
(
default_prefix
)
==
0
:
if
len
(
default_prefix
)
==
0
:
raise
InvalidConfig
(
'default_prefix must be provided'
)
raise
InvalidConfig
(
'default_prefix must be provided'
)
arr
=
default_prefix
.
strip
(
"/"
).
split
(
"/"
)
arr
=
default_prefix
.
strip
(
'/'
).
split
(
'/'
)
self
.
default_bucket
=
arr
[
0
]
self
.
default_bucket
=
arr
[
0
]
self
.
default_prefix
=
"/"
.
join
(
arr
[
1
:])
self
.
default_prefix
=
'/'
.
join
(
arr
[
1
:])
found_default_bucket_config
=
False
found_default_bucket_config
=
False
for
conf
in
s3_configs
:
for
conf
in
s3_configs
:
...
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
...
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
s3_reader
=
self
.
__get_s3_client
(
bucket_name
)
s3_reader
=
self
.
__get_s3_client
(
bucket_name
)
else
:
else
:
s3_reader
=
self
.
__get_s3_client
(
self
.
default_bucket
)
s3_reader
=
self
.
__get_s3_client
(
self
.
default_bucket
)
path
=
os
.
path
.
join
(
self
.
default_prefix
,
path
)
if
self
.
default_prefix
:
path
=
self
.
default_prefix
+
'/'
+
path
return
s3_reader
.
read_at
(
path
,
offset
,
limit
)
return
s3_reader
.
read_at
(
path
,
offset
,
limit
)
...
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
...
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
s3_writer
=
self
.
__get_s3_client
(
bucket_name
)
s3_writer
=
self
.
__get_s3_client
(
bucket_name
)
else
:
else
:
s3_writer
=
self
.
__get_s3_client
(
self
.
default_bucket
)
s3_writer
=
self
.
__get_s3_client
(
self
.
default_bucket
)
path
=
os
.
path
.
join
(
self
.
default_prefix
,
path
)
if
self
.
default_prefix
:
path
=
self
.
default_prefix
+
'/'
+
path
return
s3_writer
.
write
(
path
,
data
)
return
s3_writer
.
write
(
path
,
data
)
magic_pdf/pdf_parse_union_core_v2.py
View file @
8a0aa7a4
...
@@ -91,7 +91,8 @@ def chars_to_content(span):
...
@@ -91,7 +91,8 @@ def chars_to_content(span):
content
=
''
content
=
''
for
char
in
span
[
'chars'
]:
for
char
in
span
[
'chars'
]:
# 如果下一个char的x0和上一个char的x1距离超过一个字符宽度,则需要在中间插入一个空格
# 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
char1
=
char
char1
=
char
char2
=
span
[
'chars'
][
span
[
'chars'
].
index
(
char
)
+
1
]
if
span
[
'chars'
].
index
(
char
)
+
1
<
len
(
span
[
'chars'
])
else
None
char2
=
span
[
'chars'
][
span
[
'chars'
].
index
(
char
)
+
1
]
if
span
[
'chars'
].
index
(
char
)
+
1
<
len
(
span
[
'chars'
])
else
None
if
char2
and
char2
[
'bbox'
][
0
]
-
char1
[
'bbox'
][
2
]
>
char_avg_width
*
0.25
and
char
[
'c'
]
!=
' '
and
char2
[
'c'
]
!=
' '
:
if
char2
and
char2
[
'bbox'
][
0
]
-
char1
[
'bbox'
][
2
]
>
char_avg_width
*
0.25
and
char
[
'c'
]
!=
' '
and
char2
[
'c'
]
!=
' '
:
...
...
projects/web_api/app.py
View file @
8a0aa7a4
...
@@ -13,7 +13,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
...
@@ -13,7 +13,7 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.data.data_reader_writer
import
FileBasedDataWriter
from
magic_pdf.data.dataset
import
PymuDocDataset
from
magic_pdf.data.dataset
import
PymuDocDataset
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.model.doc_analyze_by_custom_model
import
doc_analyze
from
magic_pdf.
model.
operators
import
InferenceResult
from
magic_pdf.operators
.models
import
InferenceResult
model_config
.
__use_inside_model__
=
True
model_config
.
__use_inside_model__
=
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment