Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
d5dbed73
Commit
d5dbed73
authored
Mar 01, 2024
by
赵小蒙
Browse files
目录重构
parent
7c7910e4
Changes
85
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
23 additions
and
23 deletions
+23
-23
magic_pdf/libs/drop_tag.py
magic_pdf/libs/drop_tag.py
+0
-0
magic_pdf/libs/json_compressor.py
magic_pdf/libs/json_compressor.py
+0
-0
magic_pdf/libs/language.py
magic_pdf/libs/language.py
+0
-0
magic_pdf/libs/markdown_utils.py
magic_pdf/libs/markdown_utils.py
+0
-0
magic_pdf/libs/nlp_utils.py
magic_pdf/libs/nlp_utils.py
+1
-1
magic_pdf/libs/pdf_image_tools.py
magic_pdf/libs/pdf_image_tools.py
+2
-2
magic_pdf/libs/safe_filename.py
magic_pdf/libs/safe_filename.py
+0
-0
magic_pdf/libs/textbase.py
magic_pdf/libs/textbase.py
+0
-0
magic_pdf/libs/vis_utils.py
magic_pdf/libs/vis_utils.py
+1
-1
magic_pdf/para/__init__.py
magic_pdf/para/__init__.py
+0
-0
magic_pdf/para/block_continuation_processor.py
magic_pdf/para/block_continuation_processor.py
+1
-1
magic_pdf/para/block_termination_processor.py
magic_pdf/para/block_termination_processor.py
+1
-1
magic_pdf/para/commons.py
magic_pdf/para/commons.py
+1
-1
magic_pdf/para/denoise.py
magic_pdf/para/denoise.py
+1
-1
magic_pdf/para/draw.py
magic_pdf/para/draw.py
+2
-2
magic_pdf/para/exceptions.py
magic_pdf/para/exceptions.py
+0
-0
magic_pdf/para/layout_match_processor.py
magic_pdf/para/layout_match_processor.py
+1
-1
magic_pdf/para/para_pipeline.py
magic_pdf/para/para_pipeline.py
+11
-11
magic_pdf/para/raw_processor.py
magic_pdf/para/raw_processor.py
+0
-0
magic_pdf/para/stats.py
magic_pdf/para/stats.py
+1
-1
No files found.
pdf_tools
/libs/drop_tag.py
→
magic_pdf
/libs/drop_tag.py
View file @
d5dbed73
File moved
pdf_tools
/libs/json_compressor.py
→
magic_pdf
/libs/json_compressor.py
View file @
d5dbed73
File moved
pdf_tools
/libs/language.py
→
magic_pdf
/libs/language.py
View file @
d5dbed73
File moved
pdf_tools
/libs/markdown_utils.py
→
magic_pdf
/libs/markdown_utils.py
View file @
d5dbed73
File moved
pdf_tools
/libs/nlp_utils.py
→
magic_pdf
/libs/nlp_utils.py
View file @
d5dbed73
...
...
@@ -10,7 +10,7 @@ import spacy
import
en_core_web_sm
import
zh_core_web_sm
from
pdf_tools
.libs.language
import
detect_lang
from
magic_pdf
.libs.language
import
detect_lang
class
NLPModels
:
...
...
pdf_tools
/libs/pdf_image_tools.py
→
magic_pdf
/libs/pdf_image_tools.py
View file @
d5dbed73
...
...
@@ -4,9 +4,9 @@ from typing import Tuple
import
io
# from app.common.s3 import get_s3_client
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
loguru
import
logger
from
pdf_tools
.libs.commons
import
parse_bucket_key
,
join_path
from
magic_pdf
.libs.commons
import
parse_bucket_key
,
join_path
def
cut_image
(
bbox
:
Tuple
,
page_num
:
int
,
page
:
fitz
.
Page
,
save_parent_path
:
str
,
s3_return_path
=
None
,
img_s3_client
=
None
,
upload_switch
=
True
):
...
...
pdf_tools
/libs/safe_filename.py
→
magic_pdf
/libs/safe_filename.py
View file @
d5dbed73
File moved
pdf_tools
/libs/textbase.py
→
magic_pdf
/libs/textbase.py
View file @
d5dbed73
File moved
pdf_tools
/libs/vis_utils.py
→
magic_pdf
/libs/vis_utils.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
import
os
...
...
pdf_tools
/para/__init__.py
→
magic_pdf
/para/__init__.py
View file @
d5dbed73
File moved
pdf_tools
/para/block_continuation_processor.py
→
magic_pdf
/para/block_continuation_processor.py
View file @
d5dbed73
import
os
import
unicodedata
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/block_termination_processor.py
→
magic_pdf
/para/block_termination_processor.py
View file @
d5dbed73
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/commons.py
→
magic_pdf
/para/commons.py
View file @
d5dbed73
import
sys
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
termcolor
import
cprint
...
...
pdf_tools
/para/denoise.py
→
magic_pdf
/para/denoise.py
View file @
d5dbed73
import
math
from
collections
import
defaultdict
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
sys
.
stdout
.
reconfigure
(
encoding
=
"utf-8"
)
# type: ignore
...
...
pdf_tools
/para/draw.py
→
magic_pdf
/para/draw.py
View file @
d5dbed73
from
pdf_tools
.libs.commons
import
fitz
from
magic_pdf
.libs.commons
import
fitz
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/exceptions.py
→
magic_pdf
/para/exceptions.py
View file @
d5dbed73
File moved
pdf_tools
/para/layout_match_processor.py
→
magic_pdf
/para/layout_match_processor.py
View file @
d5dbed73
import
math
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
pdf_tools
/para/para_pipeline.py
→
magic_pdf
/para/para_pipeline.py
View file @
d5dbed73
import
os
import
json
from
pdf_tools
.para.commons
import
*
from
pdf_tools
.para.raw_processor
import
RawBlockProcessor
from
pdf_tools
.para.layout_match_processor
import
LayoutFilterProcessor
from
pdf_tools
.para.stats
import
BlockStatisticsCalculator
from
pdf_tools
.para.stats
import
DocStatisticsCalculator
from
pdf_tools
.para.title_processor
import
TitleProcessor
from
pdf_tools
.para.block_termination_processor
import
BlockTerminationProcessor
from
pdf_tools
.para.block_continuation_processor
import
BlockContinuationProcessor
from
pdf_tools
.para.draw
import
DrawAnnos
from
pdf_tools
.para.exceptions
import
(
from
magic_pdf
.para.commons
import
*
from
magic_pdf
.para.raw_processor
import
RawBlockProcessor
from
magic_pdf
.para.layout_match_processor
import
LayoutFilterProcessor
from
magic_pdf
.para.stats
import
BlockStatisticsCalculator
from
magic_pdf
.para.stats
import
DocStatisticsCalculator
from
magic_pdf
.para.title_processor
import
TitleProcessor
from
magic_pdf
.para.block_termination_processor
import
BlockTerminationProcessor
from
magic_pdf
.para.block_continuation_processor
import
BlockContinuationProcessor
from
magic_pdf
.para.draw
import
DrawAnnos
from
magic_pdf
.para.exceptions
import
(
DenseSingleLineBlockException
,
TitleDetectionException
,
TitleLevelException
,
...
...
pdf_tools
/para/raw_processor.py
→
magic_pdf
/para/raw_processor.py
View file @
d5dbed73
File moved
pdf_tools
/para/stats.py
→
magic_pdf
/para/stats.py
View file @
d5dbed73
from
collections
import
Counter
import
numpy
as
np
from
pdf_tools
.para.commons
import
*
from
magic_pdf
.para.commons
import
*
if
sys
.
version_info
[
0
]
>=
3
:
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment