Commit 53cd9103 authored by myhloli's avatar myhloli
Browse files

refactor: update project configuration and dependencies in pyproject.toml and setup.py

parent 20790663
......@@ -200,7 +200,41 @@ def isolated_formula_clean(txt):
latex = txt[:]
if latex.startswith("\\["): latex = latex[2:]
if latex.endswith("\\]"): latex = latex[:-2]
return latex.strip()
latex = latex_fix(latex.strip())
return latex
# Delimiters accepted directly after \left / \right. This is the original
# whitelist plus the plain "." (LaTeX's null delimiter), which the original
# code already emitted as its fallback.
_VALID_DELIMS = frozenset([
    r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
    r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
    r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
    r'\Uparrow', r'\Downarrow', r'\|', r'\.', r'.',
])

# A complete \left or \right control word (the lookahead rejects \leftarrow,
# \rightarrow, \lefteqn, ...), optionally followed by ONE delimiter token:
# a control word, a control symbol, or a single plain character. A following
# \left / \right is never consumed as the delimiter token.
_DELIM_PATTERN = re.compile(
    r'(\\left|\\right)(?![a-zA-Z])'
    r'((?!\\(?:left|right)(?![a-zA-Z]))'
    r'(?:\\[a-zA-Z]+|\\[^a-zA-Z\s]|[^\s\\]))?'
)
_LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
_RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
_LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\(?:left|right)(?![a-zA-Z])\.?')


def _fix_delim(match):
    """Keep a whitelisted delimiter; otherwise replace it with the null delimiter."""
    if match.group(2) in _VALID_DELIMS:
        return match.group(0)
    # Missing or non-whitelisted delimiter token: emit "\left." / "\right.".
    return match.group(1) + "."


def latex_fix(latex):
    """Repair ``\\left`` / ``\\right`` usage in a LaTeX formula string.

    Two passes are applied:

    1. Every ``\\left`` / ``\\right`` must be followed by a whitelisted
       delimiter. If the next token is missing (whitespace or end of string)
       or is not whitelisted, it is replaced by ``.`` so the command becomes
       ``\\left.`` / ``\\right.``.
    2. If the number of ``\\left`` and ``\\right`` commands differs, all of
       them (together with a directly following ``.``) are stripped, leaving
       any real delimiter characters in place.

    Only the single token following the command is inspected, so text glued
    to the delimiter (``\\left(x+y\\right)``) is preserved. Longer control
    words that merely start with ``\\left`` / ``\\right`` (``\\rightarrow``,
    ``\\leftarrow``, ``\\lefteqn``) are never touched.

    Args:
        latex: The LaTeX source to repair.

    Returns:
        The repaired LaTeX string.
    """
    latex = _DELIM_PATTERN.sub(_fix_delim, latex)

    left_count = len(_LEFT_COUNT_PATTERN.findall(latex))
    right_count = len(_RIGHT_COUNT_PATTERN.findall(latex))
    if left_count != right_count:
        # Unbalanced pairs cannot be rendered; drop the sizing commands.
        return _LEFT_RIGHT_REMOVE_PATTERN.sub('', latex)
    return latex
def __reduct_overlap(bboxes):
......
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "mineru"
dynamic = ["version"]
license = {text = "AGPL-3.0"}
description = "A practical tool for converting PDF to Markdown"
readme = "README.md"
requires-python = ">=3.10,<3.14"
keywords = ["magic-pdf", "mineru", "MinerU", "convert", "pdf", "markdown"]
classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dependencies = [
"boto3>=1.28.43",
"click>=8.1.7",
"loguru>=0.7.2",
"numpy>=1.21.6",
"pdfminer.six==20250506",
"tqdm>=4.67.1",
"requests",
"httpx",
"pillow>=11.0.0",
"pypdfium2>=4.30.0",
"pypdf>=5.6.0",
"reportlab",
"pdftext>=0.6.2"
]
[tool.black]
line-length = 128
[project.optional-dependencies]
vlm = [
"transformers>=4.51.1",
"torch>=2.6.0",
"accelerate>=1.5.1",
"pydantic>=2.7.2,<2.11",
]
sglang = [
"sglang[all]==0.4.6.post5",
]
pipeline = [
"matplotlib>=3.10,<4",
"ultralytics>=8.3.48,<9",
"doclayout_yolo==0.0.4",
"dill>=0.3.8,<1",
"rapid_table>=1.0.5,<2.0.0",
"PyYAML>=6.0.2,<7",
"ftfy>=6.3.1,<7",
"openai>=1.70.0,<2",
"shapely>=2.0.7,<3",
"pyclipper>=1.3.0,<2",
"omegaconf>=2.3.0,<3",
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
"torchvision",
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
]
pipeline_old_linux = [
"matplotlib>=3.10,<=3.10.1",
"ultralytics>=8.3.48,<=8.3.104",
"doclayout_yolo==0.0.4",
"dill==0.3.8",
"PyYAML==6.0.2",
"ftfy==6.3.1",
"openai==1.71.0",
"shapely==2.1.0",
"pyclipper==1.3.0.post6",
"omegaconf==2.3.0",
"albumentations==1.4.20",
"rapid_table==1.0.3",
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
"torchvision",
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
]
[tool.ruff]
line-length = 128
[project.urls]
Home = "https://mineru.net/"
Repository = "https://github.com/opendatalab/MinerU"
[project.scripts]
mineru = "mineru.cli:client.main"
mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
mineru-models-download = "mineru.cli.models_download:download_models"
[tool.setuptools.dynamic]
version = {attr = "mineru.version.__version__"}
[tool.setuptools.packages.find]
include = ["mineru*"]
namespaces = false
[tool.setuptools.package-data]
"mineru.resources" = ["**"]
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources" = ["**"]
[tool.setuptools]
include-package-data = true
zip-safe = false
......@@ -3,22 +3,6 @@ from setuptools import setup, find_packages
from mineru.version import __version__
def parse_requirements(filename):
    """Read a requirements file and return its lines as a list of strings.

    Any line containing the substring ``http`` is cut at its first ``@`` and
    stripped of surrounding whitespace, keeping only the part before the URL.
    All other lines (including blank ones) are returned unchanged.
    """
    with open(filename) as handle:
        entries = handle.read().splitlines()
    return [
        entry.split('@')[0].strip() if "http" in entry else entry
        for entry in entries
    ]
if __name__ == '__main__':
with Path(Path(__file__).parent,
'README.md').open(encoding='utf-8') as file:
......@@ -32,17 +16,35 @@ if __name__ == '__main__':
"mineru.resources": ["**"], # 包含magic_pdf.resources目录下的所有文件
"mineru.model.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"], # pytorchocr.resources目录下的所有文件
},
install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库
install_requires=[
"boto3>=1.28.43",
"click>=8.1.7",
"loguru>=0.6.0",
"numpy>=1.21.6",
"pdfminer.six==20250506",
"tqdm>=4.67.1",
"requests",
"httpx",
"pillow",
"pypdfium2",
"loguru",
"pypdf",
"reportlab",
], # 项目依赖的第三方库
extras_require={
"lite": [
"paddleocr==2.7.3",
"paddlepaddle==3.0.0b1;platform_system=='Linux'",
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
"vlm":[
"transformers>=4.51.1",
"torch>=2.6.0",
"accelerate>=1.5.1",
"pydantic>=2.7.2,<2.11",
],
"sglang": [
"sglang[all]==0.4.6.post5",
],
"full": [
"pipeline": [
"matplotlib>=3.10,<4",
"ultralytics>=8.3.48,<9", # yolov8,公式检测
"doclayout_yolo==0.0.2b1", # doclayout_yolo
"doclayout_yolo==0.0.4", # doclayout_yolo
"dill>=0.3.8,<1", # doclayout_yolo
"rapid_table>=1.0.5,<2.0.0", # rapid_table
"PyYAML>=6.0.2,<7", # yaml
......@@ -51,11 +53,15 @@ if __name__ == '__main__':
"shapely>=2.0.7,<3", # imgaug-paddleocr2pytorch
"pyclipper>=1.3.0,<2", # paddleocr2pytorch
"omegaconf>=2.3.0,<3", # paddleocr2pytorch
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
"torchvision",
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
],
"full_old_linux": [
"pipeline_old_linux": [
"matplotlib>=3.10,<=3.10.1",
"ultralytics>=8.3.48,<=8.3.104", # yolov8,公式检测
"doclayout_yolo==0.0.2b1", # doclayout_yolo
"doclayout_yolo==0.0.4", # doclayout_yolo
"dill==0.3.8", # doclayout_yolo
"PyYAML==6.0.2", # yaml
"ftfy==6.3.1", # unimernet_hf
......@@ -65,6 +71,10 @@ if __name__ == '__main__':
"omegaconf==2.3.0", # paddleocr2pytorch
"albumentations==1.4.20", # 1.4.21引入的simsimd不支持2019年及更早的linux系统
"rapid_table==1.0.3", # rapid_table新版本依赖的onnxruntime不支持2019年及更早的linux系统
"torch>=2.2.2,!=2.5.0,!=2.5.1,<3",
"torchvision",
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
],
},
description="A practical tool for converting PDF to Markdown", # 简短描述
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment