Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
wangsen
MinerU
Commits
53cd9103
Commit
53cd9103
authored
Jun 10, 2025
by
myhloli
Browse files
refactor: update project configuration and dependencies in pyproject.toml and setup.py
parent
20790663
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
170 additions
and
30 deletions
+170
-30
mineru/backend/vlm/vlm_magic_model.py
mineru/backend/vlm/vlm_magic_model.py
+35
-1
pyproject.toml
pyproject.toml
+100
-4
setup.py
setup.py
+35
-25
No files found.
mineru/backend/vlm/vlm_magic_model.py
View file @
53cd9103
...
@@ -200,7 +200,41 @@ def isolated_formula_clean(txt):
...
@@ -200,7 +200,41 @@ def isolated_formula_clean(txt):
latex
=
txt
[:]
latex
=
txt
[:]
if
latex
.
startswith
(
"
\\
["
):
latex
=
latex
[
2
:]
if
latex
.
startswith
(
"
\\
["
):
latex
=
latex
[
2
:]
if
latex
.
endswith
(
"
\\
]"
):
latex
=
latex
[:
-
2
]
if
latex
.
endswith
(
"
\\
]"
):
latex
=
latex
[:
-
2
]
return
latex
.
strip
()
latex
=
latex_fix
(
latex
.
strip
())
return
latex
# Whitelist of delimiters accepted directly after \left / \right.
# NOTE(review): the list contains r'\.' but not a bare '.'; `\left.` still
# round-trips because an unlisted delimiter is rewritten to '.'. Confirm
# whether r'\.' was intended.
_VALID_LEFT_RIGHT_DELIMS = frozenset([
    r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
    r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor', r'\rfloor',
    r'\backslash', r'\uparrow', r'\downarrow', r'\Uparrow', r'\Downarrow',
    r'\|', r'\.',
])

# A standalone \left or \right command followed by at most ONE delimiter token.
# The (?![a-zA-Z]) guard keeps longer commands such as \leftarrow, \rightarrow
# and \lefteqn out of the match. The token is either a control word (but never
# another \left / \right), a control symbol (backslash + non-letter), or a
# single non-space character; it may be absent.
_LEFT_RIGHT_DELIM_PATTERN = re.compile(
    r'(\\(?:left|right))(?![a-zA-Z])'
    r'((?:\\(?!(?:left|right)(?![a-zA-Z]))[a-zA-Z]+|\\[^a-zA-Z\s]|[^\s\\])?)'
)
_LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')    # does not match \lefteqn etc.
_RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')  # does not match \rightarrow
# Strips a standalone \left / \right together with an optional "." delimiter.
_LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\(?:left|right)(?![a-zA-Z])\.?')


def _fix_left_right_delim(match):
    """Keep a whitelisted delimiter; otherwise substitute the null delimiter '.'."""
    delim = match.group(2)
    if delim in _VALID_LEFT_RIGHT_DELIMS:
        return match.group(0)
    # Missing or unsupported delimiter: fall back to "\left." / "\right.".
    return match.group(1) + "."


def latex_fix(latex):
    """Repair ``\\left`` / ``\\right`` delimiter usage in a LaTeX string.

    Two passes are applied:

    1. Every standalone ``\\left`` / ``\\right`` whose immediately following
       delimiter token is missing or not in the whitelist gets the null
       delimiter ``.`` instead (an unsupported token is replaced by it).
    2. If the number of ``\\left`` and ``\\right`` commands then differs, all
       of them (and a directly following ``.``) are removed, leaving the
       remaining text in place.

    Commands that merely start with ``\\left`` / ``\\right`` (``\\leftarrow``,
    ``\\rightarrow``, ``\\lefteqn`` ...) are never touched, and text glued to a
    delimiter without whitespace (``\\left(a+b\\right)``) is preserved.

    Args:
        latex: The LaTeX source to repair.

    Returns:
        The repaired LaTeX string.
    """
    latex = _LEFT_RIGHT_DELIM_PATTERN.sub(_fix_left_right_delim, latex)

    left_count = len(_LEFT_COUNT_PATTERN.findall(latex))
    right_count = len(_RIGHT_COUNT_PATTERN.findall(latex))

    # Unbalanced pairs cannot be rendered; drop the sizing commands entirely.
    if left_count != right_count:
        return _LEFT_RIGHT_REMOVE_PATTERN.sub('', latex)

    return latex
def
__reduct_overlap
(
bboxes
):
def
__reduct_overlap
(
bboxes
):
...
...
pyproject.toml
View file @
53cd9103
[build-system]
requires
=
[
"setuptools>=61.0"
,
"wheel"
]
build-backend
=
"setuptools.build_meta"
[project]
name
=
"mineru"
dynamic
=
["version"]
license
=
{
text
=
"AGPL-3.0"
}
description
=
"A practical tool for converting PDF to Markdown"
readme
=
"README.md"
requires-python
=
">=3.10,<3.14"
keywords
=
[
"magic-pdf"
,
"mineru"
,
"MinerU"
,
"convert"
,
"pdf"
,
"markdown"
]
classifiers
=
[
"Programming Language :: Python :: 3.10"
,
"Programming Language :: Python :: 3.11"
,
"Programming Language :: Python :: 3.12"
,
"Programming Language :: Python :: 3.13"
,
]
dependencies
=
[
"boto3>=1.28.43"
,
"click>=8.1.7"
,
"loguru>=0.7.2"
,
"numpy>=1.21.6"
,
"pdfminer.six==20250506"
,
"tqdm>=4.67.1"
,
"requests"
,
"httpx"
,
"pillow>=11.0.0"
,
"pypdfium2>=4.30.0"
,
"pypdf>=5.6.0"
,
"reportlab"
,
"pdftext>=0.6.2"
]
[tool.black]
[project.optional-dependencies]
line-length
=
128
vlm
=
[
"transformers>=4.51.1"
,
"torch>=2.6.0"
,
"accelerate>=1.5.1"
,
"pydantic>=2.7.2,<2.11"
,
]
sglang
=
[
"sglang[all]==0.4.6.post5"
,
]
pipeline
=
[
"matplotlib>=3.10,<4"
,
"ultralytics>=8.3.48,<9"
,
"doclayout_yolo==0.0.4"
,
"dill>=0.3.8,<1"
,
"rapid_table>=1.0.5,<2.0.0"
,
"PyYAML>=6.0.2,<7"
,
"ftfy>=6.3.1,<7"
,
"openai>=1.70.0,<2"
,
"shapely>=2.0.7,<3"
,
"pyclipper>=1.3.0,<2"
,
"omegaconf>=2.3.0,<3"
,
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3"
,
"torchvision"
,
"transformers>=4.49.0,!=4.51.0,<5.0.0"
,
"fast-langdetect>=0.2.3,<0.3.0"
,
]
pipeline_old_linux
=
[
"matplotlib>=3.10,<=3.10.1"
,
"ultralytics>=8.3.48,<=8.3.104"
,
"doclayout_yolo==0.0.4"
,
"dill==0.3.8"
,
"PyYAML==6.0.2"
,
"ftfy==6.3.1"
,
"openai==1.71.0"
,
"shapely==2.1.0"
,
"pyclipper==1.3.0.post6"
,
"omegaconf==2.3.0"
,
"albumentations==1.4.20"
,
"rapid_table==1.0.3"
,
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3"
,
"torchvision"
,
"transformers>=4.49.0,!=4.51.0,<5.0.0"
,
"fast-langdetect>=0.2.3,<0.3.0"
,
]
[tool.ruff]
[project.urls]
line-length
=
128
Home
=
"https://mineru.net/"
Repository
=
"https://github.com/opendatalab/MinerU"
[project.scripts]
# Console entry points, written as "module.path:object".
# Every dotted segment of the module path must be a valid Python identifier,
# so the sglang server module is referenced with an underscore, not a hyphen.
# NOTE(review): confirm the module file is named mineru/cli/vlm_sglang_server.py.
mineru = "mineru.cli:client.main"
mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
mineru-models-download = "mineru.cli.models_download:download_models"
[tool.setuptools.dynamic]
version
=
{
attr
=
"mineru.version.__version__"
}
[tool.setuptools.packages.find]
include
=
["mineru*"]
namespaces
=
false
[tool.setuptools.package-data]
"mineru.resources"
=
["**"]
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources"
=
["**"]
[tool.setuptools]
include-package-data
=
true
zip-safe
=
false
setup.py
View file @
53cd9103
...
@@ -3,22 +3,6 @@ from setuptools import setup, find_packages
...
@@ -3,22 +3,6 @@ from setuptools import setup, find_packages
from
mineru.version
import
__version__
from
mineru.version
import
__version__
def parse_requirements(filename):
    """Read a requirements file and return its entries as a list of strings.

    Blank lines and ``#`` comment lines are skipped, and surrounding
    whitespace is removed from every entry. For a line containing ``http``
    (a direct URL reference such as ``pkg @ https://...``) only the part
    before the first ``@`` is kept.

    Args:
        filename: Path of the requirements file to read.

    Returns:
        The requirement strings in file order.
    """
    # Explicit UTF-8, matching how README.md is opened in this file.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().splitlines()

    requires = []
    for raw_line in lines:
        line = raw_line.strip()
        # An empty or comment-only line is not a requirement specifier.
        if not line or line.startswith('#'):
            continue
        if "http" in line:
            # Keep only the package name of a "name @ url" reference.
            line = line.split('@')[0].strip()
        requires.append(line)
    return requires
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
with
Path
(
Path
(
__file__
).
parent
,
with
Path
(
Path
(
__file__
).
parent
,
'README.md'
).
open
(
encoding
=
'utf-8'
)
as
file
:
'README.md'
).
open
(
encoding
=
'utf-8'
)
as
file
:
...
@@ -32,17 +16,35 @@ if __name__ == '__main__':
...
@@ -32,17 +16,35 @@ if __name__ == '__main__':
"mineru.resources"
:
[
"**"
],
# 包含magic_pdf.resources目录下的所有文件
"mineru.resources"
:
[
"**"
],
# 包含magic_pdf.resources目录下的所有文件
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources"
:
[
"**"
],
# pytorchocr.resources目录下的所有文件
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources"
:
[
"**"
],
# pytorchocr.resources目录下的所有文件
},
},
install_requires
=
parse_requirements
(
'requirements.txt'
),
# 项目依赖的第三方库
install_requires
=
[
"boto3>=1.28.43"
,
"click>=8.1.7"
,
"loguru>=0.6.0"
,
"numpy>=1.21.6"
,
"pdfminer.six==20250506"
,
"tqdm>=4.67.1"
,
"requests"
,
"httpx"
,
"pillow"
,
"pypdfium2"
,
"loguru"
,
"pypdf"
,
"reportlab"
,
],
# 项目依赖的第三方库
extras_require
=
{
extras_require
=
{
"lite"
:
[
"vlm"
:[
"paddleocr==2.7.3"
,
"transformers>=4.51.1"
,
"paddlepaddle==3.0.0b1;platform_system=='Linux'"
,
"torch>=2.6.0"
,
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'"
,
"accelerate>=1.5.1"
"pydantic>=2.7.2,<2.11"
,
],
"sglang"
:
[
"sglang[all]==0.4.6.post5"
,
],
],
"
full
"
:
[
"
pipeline
"
:
[
"matplotlib>=3.10,<4"
,
"matplotlib>=3.10,<4"
,
"ultralytics>=8.3.48,<9"
,
# yolov8,公式检测
"ultralytics>=8.3.48,<9"
,
# yolov8,公式检测
"doclayout_yolo==0.0.
2b1
"
,
# doclayout_yolo
"doclayout_yolo==0.0.
4
"
,
# doclayout_yolo
"dill>=0.3.8,<1"
,
# doclayout_yolo
"dill>=0.3.8,<1"
,
# doclayout_yolo
"rapid_table>=1.0.5,<2.0.0"
,
# rapid_table
"rapid_table>=1.0.5,<2.0.0"
,
# rapid_table
"PyYAML>=6.0.2,<7"
,
# yaml
"PyYAML>=6.0.2,<7"
,
# yaml
...
@@ -51,11 +53,15 @@ if __name__ == '__main__':
...
@@ -51,11 +53,15 @@ if __name__ == '__main__':
"shapely>=2.0.7,<3"
,
# imgaug-paddleocr2pytorch
"shapely>=2.0.7,<3"
,
# imgaug-paddleocr2pytorch
"pyclipper>=1.3.0,<2"
,
# paddleocr2pytorch
"pyclipper>=1.3.0,<2"
,
# paddleocr2pytorch
"omegaconf>=2.3.0,<3"
,
# paddleocr2pytorch
"omegaconf>=2.3.0,<3"
,
# paddleocr2pytorch
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3"
,
"torchvision"
,
"transformers>=4.49.0,!=4.51.0,<5.0.0"
,
"fast-langdetect>=0.2.3,<0.3.0"
,
],
],
"
full
_old_linux"
:
[
"
pipeline
_old_linux"
:
[
"matplotlib>=3.10,<=3.10.1"
,
"matplotlib>=3.10,<=3.10.1"
,
"ultralytics>=8.3.48,<=8.3.104"
,
# yolov8,公式检测
"ultralytics>=8.3.48,<=8.3.104"
,
# yolov8,公式检测
"doclayout_yolo==0.0.
2b1
"
,
# doclayout_yolo
"doclayout_yolo==0.0.
4
"
,
# doclayout_yolo
"dill==0.3.8"
,
# doclayout_yolo
"dill==0.3.8"
,
# doclayout_yolo
"PyYAML==6.0.2"
,
# yaml
"PyYAML==6.0.2"
,
# yaml
"ftfy==6.3.1"
,
# unimernet_hf
"ftfy==6.3.1"
,
# unimernet_hf
...
@@ -65,6 +71,10 @@ if __name__ == '__main__':
...
@@ -65,6 +71,10 @@ if __name__ == '__main__':
"omegaconf==2.3.0"
,
# paddleocr2pytorch
"omegaconf==2.3.0"
,
# paddleocr2pytorch
"albumentations==1.4.20"
,
# 1.4.21引入的simsimd不支持2019年及更早的linux系统
"albumentations==1.4.20"
,
# 1.4.21引入的simsimd不支持2019年及更早的linux系统
"rapid_table==1.0.3"
,
# rapid_table新版本依赖的onnxruntime不支持2019年及更早的linux系统
"rapid_table==1.0.3"
,
# rapid_table新版本依赖的onnxruntime不支持2019年及更早的linux系统
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3"
,
"torchvision"
,
"transformers>=4.49.0,!=4.51.0,<5.0.0"
,
"fast-langdetect>=0.2.3,<0.3.0"
,
],
],
},
},
description
=
"A practical tool for converting PDF to Markdown"
,
# 简短描述
description
=
"A practical tool for converting PDF to Markdown"
,
# 简短描述
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment