Unverified Commit 3a42ebbf authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #838 from opendatalab/release-0.9.0

Release 0.9.0
parents 765c6d77 14024793
from flask import render_template, Response
from flask_restful import Resource
class ReactAppView(Resource):
    """REST resource whose GET handler serves the rendered ``index.html`` page."""

    def get(self):
        """Render ``index.html`` and return it wrapped in a ``text/html`` response."""
        # Build a custom response object so the MIME type is set explicitly.
        return Response(render_template('index.html'), mimetype='text/html')
import socket
from api import create_app
from pathlib import Path
import yaml
def get_local_ip():
    """Return the local IPv4 address of the interface used to reach 8.8.8.8.

    A UDP socket is connected to ``8.8.8.8:80`` and the address the socket
    was bound to is read back with ``getsockname()``.

    :return: the local IP address as a string
    :raises OSError: if the socket cannot be created or connected
    """
    # The context manager guarantees the socket is closed even when
    # connect()/getsockname() raises.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.connect(('8.8.8.8', 80))  # Google DNS server
        return sock.getsockname()[0]
# Absolute, symlink-resolved location of this module.
current_file_path = Path(__file__).resolve()
# Directory that contains this module; other paths are built relative to it.
base_dir = current_file_path.parent
# Location of the YAML settings file: <base_dir>/config/config.yaml
config_path = base_dir.joinpath("config", "config.yaml")
class ConfigMap(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``cfg.NAME = value`` stores ``cfg["NAME"]`` and ``cfg.NAME`` reads it back.
    """

    # Attribute assignment is stored as a dictionary item.
    __setattr__ = dict.__setitem__

    def __getattr__(self, name):
        """Return the item stored under *name*.

        Python calls this only after normal attribute lookup has failed.

        :param name: attribute (dictionary key) being looked up
        :return: the value stored under that key
        :raises AttributeError: if the key is absent. ``KeyError`` is
            translated so that ``hasattr`` and ``getattr(obj, name, default)``
            behave as they do for ordinary objects.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None
# Load the YAML settings and select the section named by "CurrentConfig"
# ("DevelopmentConfig" when that key is absent).
with open(str(config_path), mode='r', encoding='utf-8') as fd:
    data = yaml.load(fd, Loader=yaml.FullLoader)
_config = data.get(data.get("CurrentConfig", "DevelopmentConfig"))

# Copy the selected section into an attribute-accessible mapping.
config = ConfigMap()
config.update(_config)
config['base_dir'] = base_dir

# Build the SQLAlchemy connection string from the optional "database" section.
database = _config.get("database")
if database:
    if database.get("type") == "sqlite":
        database_uri = f'sqlite:///{base_dir}/{database.get("path")}'
    elif database.get("type") == "mysql":
        database_uri = f'mysql+pymysql://{database.get("user")}:{database.get("password")}@{database.get("host")}:{database.get("port")}/{database.get("database")}?'
    else:
        # Unknown database type: leave the URI empty.
        database_uri = ''
    config['SQLALCHEMY_DATABASE_URI'] = database_uri

ip_address = get_local_ip()
port = config.get("PORT", 5559)
# SERVER_NAME must advertise the port the server actually listens on, so it
# is derived from the configured PORT instead of a hard-coded 5559.
config['SERVER_NAME'] = f'{ip_address}:{port}'
# APPLICATION_ROOT
config['APPLICATION_ROOT'] = '/'
# PREFERRED_URL_SCHEME
config['PREFERRED_URL_SCHEME'] = 'http'

app = create_app(config)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=port, debug=config.get("DEBUG", False))
from flask import jsonify
class ResponseCode:
    """Named constants for response codes and the default response message."""

    # Code used for a successful result.
    SUCCESS = 200
    # Code 400; the "WARING" spelling is part of the existing public name.
    PARAM_WARING = 400
    # Default message text.
    MESSAGE = "success"
def generate_response(data=None, code=ResponseCode.SUCCESS, msg=ResponseCode.MESSAGE, **kwargs):
    """
    Build the JSON envelope returned to the client.

    The HTTP status of the response is always 200; the business result is
    carried in the ``code`` and ``success`` fields of the JSON body.

    NOTE(review): because ``msg`` defaults to ``ResponseCode.MESSAGE``, the
    'fail' fallback is only reached when the caller passes an empty ``msg``.

    :param data: payload placed in the ``data`` field
    :param code: business status code; ``success`` is true only when it equals 200
    :param msg: message text; an empty value becomes 'success' (code 200) or 'fail'
    :param kwargs: extra fields merged into the JSON body
    :return: the response object produced by ``jsonify``
    """
    is_ok = bool(code == 200)
    if is_ok:
        text = msg or 'success'
    else:
        text = msg or 'fail'
    payload = dict(code=code, success=is_ok, data=data, msg=text, **kwargs)
    response = jsonify(payload)
    response.status_code = 200
    return response
import json
from flask import request
from werkzeug.exceptions import HTTPException
class ApiException(HTTPException):
    """Base class for API errors; renders itself as a JSON body."""

    # Class-level defaults, overridden per instance by truthy constructor arguments.
    code = 500
    msg = 'Sorry, we made a mistake Σ(っ °Д °;)っ'
    msgZH = ""
    error_code = 999

    def __init__(self, msg=None, msgZH=None, code=None, error_code=None, headers=None):
        """Override the class defaults with every argument that is truthy.

        :param msg: error message; also passed to ``HTTPException`` as its description
        :param msgZH: alternative message text stored on the instance
        :param code: HTTP status code
        :param error_code: application-specific error code
        :param headers: accepted but not used by this constructor
        """
        overrides = (
            ('code', code),
            ('msg', msg),
            ('msgZH', msgZH),
            ('error_code', error_code),
        )
        for attr_name, value in overrides:
            if value:
                setattr(self, attr_name, value)
        super().__init__(msg, None)

    @staticmethod
    def get_error_url():
        """Return ``"<METHOD> <path>"`` for the current request, without the query string."""
        verb = request.method
        path_only = str(request.full_path).partition('?')[0]
        return verb + ' ' + path_only

    def get_body(self, environ=None, scope=None):
        """Return the error payload serialized as a JSON string."""
        payload = {
            'msg': self.msg,
            'error_code': self.error_code,
            'request': self.get_error_url(),
        }
        return json.dumps(payload)

    def get_headers(self, environ=None, scope=None):
        """Return the response headers, declaring the body as JSON."""
        content_type = ("Content-Type", "application/json")
        return [content_type]
\ No newline at end of file
import hashlib
import mimetypes
def is_pdf(filename, file):
    """
    Decide whether an uploaded file is a PDF.

    Two checks must both pass:
    1. ``mimetypes.guess_type(filename)`` reports ``application/pdf``.
    2. The content starts with the ``%PDF-`` signature.

    The stream is only read when the first check passes, and it is rewound
    to offset 0 afterwards.

    :param filename: file name used for the MIME-type guess
    :param file: binary file object supporting ``read`` and ``seek``
    :return: True if the file looks like a PDF, otherwise False
    """
    # A plain ".pdf" suffix test was dropped on purpose: PDF links such as
    # https://arxiv.org/pdf/2405.08702 may carry no extension.
    # NOTE(review): mimetypes.guess_type is itself extension-based, so such
    # names are still rejected by the check below - confirm this is intended.
    mime_type, _ = mimetypes.guess_type(filename)
    if mime_type != 'application/pdf':
        return False

    # Inspect the leading bytes, then rewind so the caller can re-read the stream.
    header = file.read(5)
    file.seek(0)
    return header.startswith(b'%PDF-')
def url_is_pdf(file):
    """
    Decide from its content alone whether a file object holds a PDF.

    The first five bytes are read and compared with the ``%PDF-`` signature;
    the stream is rewound to offset 0 before returning.

    :param file: binary file object supporting ``read`` and ``seek``
    :return: True if the content starts with ``%PDF-``, otherwise False
    """
    signature = b'%PDF-'
    head = file.read(5)
    file.seek(0)
    return head.startswith(signature)
def calculate_file_hash(file, algorithm='sha256'):
    """
    Compute the hex digest of a file object's content.

    The stream is consumed in 64 KB chunks from its current position and
    rewound to offset 0 before returning.

    :param file: binary file object supporting ``read`` and ``seek``
    :param algorithm: name of a ``hashlib`` constructor, e.g. 'sha256', 'md5', 'sha1'
    :return: hexadecimal digest string
    """
    digest = getattr(hashlib, algorithm)()
    chunk_size = 64 * 1024
    while True:
        chunk = file.read(chunk_size)
        if len(chunk) == 0:
            break
        digest.update(chunk)
    file.seek(0)
    return digest.hexdigest()
def singleton_func(cls):
    """Class decorator that creates at most one instance of *cls*.

    The first call constructs the instance with the given arguments; every
    later call returns that same object and ignores its arguments.

    :param cls: class to wrap
    :return: factory function returning the single shared instance
    """
    created = {}

    def _singleton(*args, **kwargs):
        if cls in created:
            return created[cls]
        obj = cls(*args, **kwargs)
        created[cls] = obj
        return obj

    return _singleton
from api.analysis.models import *
\ No newline at end of file
import os
from loguru import logger
from pathlib import Path
from datetime import datetime
def setup_log(config):
    """
    Register a daily log file with loguru.

    The file is ``log/log_<YYYY-MM-DD>.log`` inside the directory two levels
    above this module; the ``log`` directory is created when missing.

    :param config: mapping providing ``LOG_LEVEL``
    :return: None
    """
    log_dir = os.path.join(Path(__file__).parent.parent, "log")
    log_dir_path = Path(log_dir)
    if not log_dir_path.exists():
        log_dir_path.mkdir(parents=True, exist_ok=True)

    level = config.get("LOG_LEVEL")
    today = datetime.now().strftime("%Y-%m-%d")
    target = os.path.join(log_dir, f'log_{today}.log')
    logger.add(
        str(target),
        rotation='00:00',
        encoding='utf-8',
        level=level,
        enqueue=True,
    )
def before_request():
    """Placeholder that does no work and returns None.

    NOTE(review): presumably registered as a pre-request hook - confirm at
    the registration site.
    """
    return
def after_request(response):
    """Add permissive CORS headers to *response* and return it.

    :param response: response object exposing ``headers.add(name, value)``
    :return: the same response object
    """
    cors_headers = (
        ('Access-Control-Allow-Origin', '*'),
        ('Access-Control-Allow-Headers', 'Content-Type,Authorization'),
    )
    for header_name, header_value in cors_headers:
        response.headers.add(header_name, header_value)
    return response
# Base configuration; the &base anchor is merged into each section below
BaseConfig: &base
DEBUG: false
PORT: 5559
LOG_LEVEL: "DEBUG"
SQLALCHEMY_TRACK_MODIFICATIONS: true
SQLALCHEMY_DATABASE_URI: ""
PROPAGATE_EXCEPTIONS: true
SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
JWT_ACCESS_TOKEN_EXPIRES: 3600
PDF_UPLOAD_FOLDER: "upload_pdf"
PDF_ANALYSIS_FOLDER: "analysis_pdf"
# Path of the built (bundled) front-end project
REACT_APP_DIST: "../../web/dist/"
# Development configuration
DevelopmentConfig:
<<: *base
database:
type: sqlite
path: config/mineru_web.db
# Production configuration
ProductionConfig:
<<: *base
# Testing configuration
TestingConfig:
<<: *base
# Name of the configuration section currently in use
CurrentConfig: "DevelopmentConfig"
......@@ -5,14 +5,14 @@ PyMuPDF>=1.24.9
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
fast-langdetect==0.2.0
wordninja>=2.0.0
scikit-learn>=1.0.2
pdfminer.six==20231228
unimernet==0.1.6
unimernet==0.2.1
matplotlib
ultralytics
paddleocr==2.7.3
paddlepaddle==3.0.0b1
pypandoc
struct-eqtable==0.1.0
doclayout-yolo==0.0.2
detectron2
......@@ -16,4 +16,5 @@ pypandoc
pyopenssl==24.0.0
struct-eqtable==0.1.0
pytest-cov
beautifulsoup4
\ No newline at end of file
beautifulsoup4
coverage
\ No newline at end of file
......@@ -8,5 +8,6 @@ pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
scikit-learn>=1.0.2
wordninja>=2.0.0
torch>=2.2.2,<=2.3.1
transformers
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
......@@ -36,7 +36,7 @@ if __name__ == '__main__':
"paddlepaddle==3.0.0b1;platform_system=='Linux'",
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
],
"full": ["unimernet==0.1.6", # 0.1.6版本大幅裁剪依赖包范围,推荐使用此版本
"full": ["unimernet==0.2.1", # unimernet升级0.2.1
"matplotlib<=3.9.0;platform_system=='Windows'", # 3.9.1及之后不提供windows的预编译包,避免一些没有编译环境的windows设备安装失败
"matplotlib;platform_system=='Linux' or platform_system=='Darwin'", # linux 和 macos 不应限制matplotlib的最高版本,以避免无法更新导致的一些bug
"ultralytics", # yolov8,公式检测
......@@ -45,6 +45,7 @@ if __name__ == '__main__':
"paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'", # windows版本3.0.0b1效率下降,需锁定2.6.1
"pypandoc", # 表格解析latex转html
"struct-eqtable==0.1.0", # 表格解析
"doclayout_yolo==0.0.2", # doclayout_yolo
"detectron2"
],
},
......
"""
clean coverage
"""
import os
import shutil
def delete_file(path):
    """Delete the file or directory tree at *path*, if it exists.

    A missing path is silently ignored. Deletion failures are reported on
    stdout instead of being raised.

    :param path: file or directory to remove
    :return: None
    """
    # Nothing to do when the path is absent. (The guard used to be inverted,
    # which made the file branch unreachable.)
    if not os.path.exists(path):
        return

    if os.path.isfile(path):
        try:
            os.remove(path)
        except (OSError, TypeError) as e:
            # os.remove reports failures as OSError.
            print(f"Error deleting file '{path}': {e}")
        else:
            print(f"File '{path}' deleted.")
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except (OSError, TypeError) as e:
            # shutil.rmtree reports failures as OSError.
            print(f"Error deleting directory '{path}': {e}")
        else:
            print(f"Directory '{path}' and its contents deleted.")
if __name__ == "__main__":
    # Only the "htmlcov/" directory is removed; deletion of the ".coverage"
    # file is currently disabled.
    for target in ("htmlcov/",):
        delete_file(target)
......@@ -2,7 +2,7 @@
get cov
"""
from bs4 import BeautifulSoup
import shutil
def get_covrage():
"""get covrage"""
# 发送请求获取网页内容
......
#!/bin/bash
# 定义最大重试次数
max_retries=5
retry_count=0
while true; do
# prepare env
source activate MinerU
pip install -r requirements-qa.txt
pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
#python -m pip install -r requirements-qa.txt
python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
exit_code=$?
if [ $exit_code -eq 0 ]; then
echo "test.sh 成功执行!"
......@@ -21,6 +19,6 @@ while true; do
exit 1
fi
echo "test.sh 执行失败 (退出码: $exit_code)。尝试第 $retry_count 次重试..."
sleep 5 # 等待 5 秒后重试
sleep 5
fi
done
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment