Merge pull request #838 from opendatalab/release-0.9.0

Release 0.9.0

Merge pull request #838 from opendatalab/release-0.9.0
Release 0.9.0
3a42ebbf · Xiaomeng Zhao · GitHub · 765c6d77 · 14024793 · 3a42ebbf
Unverified Commit 3a42ebbf authored Nov 01, 2024 by Xiaomeng Zhao Committed by GitHub Nov 01, 2024
20 changed files
--- a/projects/web_demo/web_demo/api/react_app/react_app_view.py
+++ b/projects/web_demo/web_demo/api/react_app/react_app_view.py
+from flask import render_template, Response
+from flask_restful import Resource
+
+
+class ReactAppView(Resource):
+    def get(self):
+        # 创建自定义的响应对象
+        rendered_template = render_template('index.html')
+        response = Response(rendered_template, mimetype='text/html')
+
+        return response
--- a/projects/web_demo/web_demo/app.py
+++ b/projects/web_demo/web_demo/app.py
+import socket
+from api import create_app
+from pathlib import Path
+import yaml
+
+
+def get_local_ip():
+    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    sock.connect(('8.8.8.8', 80))  # Google DNS 服务器
+    ip_address = sock.getsockname()[0]
+    sock.close()
+    return ip_address
+
+
+# Anchor paths at this file's directory rather than the current working directory.
+current_file_path = Path(__file__).resolve()
+base_dir = current_file_path.parent
+# Location of the YAML configuration file, relative to base_dir.
+config_path = base_dir / "config/config.yaml"
+
+
+class ConfigMap(dict):
+    __setattr__ = dict.__setitem__
+    __getattr__ = dict.__getitem__
+
+
+# Load the YAML file and select the section named by "CurrentConfig",
+# falling back to "DevelopmentConfig" when that key is absent.
+# NOTE(review): if the named section does not exist, _config is None and the
+# .items() call below fails.
+# NOTE(review): yaml.FullLoader is used here; presumably the file is trusted
+# local configuration -- confirm before loading anything user-supplied.
+with open(str(config_path), mode='r', encoding='utf-8') as fd:
+    data = yaml.load(fd, Loader=yaml.FullLoader)
+    _config = data.get(data.get("CurrentConfig", "DevelopmentConfig"))
+# Copy the selected section into an attribute-accessible mapping.
+config = ConfigMap()
+for k, v in _config.items():
+    config[k] = v
+config['base_dir'] = base_dir
+# Build SQLALCHEMY_DATABASE_URI from the optional "database" section.
+database = _config.get("database")
+if database:
+    if database.get("type") == "sqlite":
+        # The SQLite file path is taken relative to base_dir.
+        database_uri = f'sqlite:///{base_dir}/{database.get("path")}'
+    elif database.get("type") == "mysql":
+        database_uri = f'mysql+pymysql://{database.get("user")}:{database.get("password")}@{database.get("host")}:{database.get("port")}/{database.get("database")}?'
+    else:
+        # Any other database type results in an empty URI.
+        database_uri = ''
+    config['SQLALCHEMY_DATABASE_URI'] = database_uri
+
+ip_address = get_local_ip()
+port = config.get("PORT", 5559)
+# 配置 SERVER_NAME
+config['SERVER_NAME'] = f'{ip_address}:5559'
+# 配置 APPLICATION_ROOT
+config['APPLICATION_ROOT'] = '/'
+# 配置 PREFERRED_URL_SCHEME
+config['PREFERRED_URL_SCHEME'] = 'http'
+
+app = create_app(config)
+
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=port, debug=config.get("DEBUG", False))
--- a/projects/web_demo/web_demo/common/__init__.py
+++ b/projects/web_demo/web_demo/common/__init__.py
--- a/projects/web_demo/web_demo/common/custom_response.py
+++ b/projects/web_demo/web_demo/common/custom_response.py
+from flask import jsonify
+
+
+class ResponseCode:
+    """Constants for response codes and the default response message."""
+    # Code used as the default ``code`` of generate_response.
+    SUCCESS = 200
+    # NOTE(review): presumably means "parameter warning"; the spelling is part
+    # of the public name and is left unchanged.
+    PARAM_WARING = 400
+    # Message used as the default ``msg`` of generate_response.
+    MESSAGE = "success"
+
+
+def generate_response(data=None, code=ResponseCode.SUCCESS, msg=ResponseCode.MESSAGE, **kwargs):
+    """
+    自定义响应
+    :param code:状态码
+    :param data:返回数据
+    :param msg:返回消息
+    :param kwargs:
+    :return:
+    """
+    msg = msg or 'success' if code == 200 else msg or 'fail'
+    success = True if code == 200 else False
+    res = jsonify(dict(code=code, success=success, data=data, msg=msg, **kwargs))
+    res.status_code = 200
+    return res
--- a/projects/web_demo/web_demo/common/error_types.py
+++ b/projects/web_demo/web_demo/common/error_types.py
+import json
+from flask import request
+from werkzeug.exceptions import HTTPException
+
+
+class ApiException(HTTPException):
+    """API错误基类"""
+    code = 500
+    msg = 'Sorry, we made a mistake Σ(っ °Д °;)っ'
+    msgZH = ""
+    error_code = 999
+
+    def __init__(self, msg=None, msgZH=None, code=None, error_code=None, headers=None):
+        if code:
+            self.code = code
+        if msg:
+            self.msg = msg
+        if msgZH:
+            self.msgZH = msgZH
+        if error_code:
+            self.error_code = error_code
+        super(ApiException, self).__init__(msg, None)
+
+    @staticmethod
+    def get_error_url():
+        """获取出错路由和请求方式"""
+        method = request.method
+        full_path = str(request.full_path)
+        main_path = full_path.split('?')[0]
+        res = method + ' ' + main_path
+        return res
+
+    def get_body(self, environ=None, scope=None):
+        """异常返回信息"""
+        body = dict(
+            msg=self.msg,
+            error_code=self.error_code,
+            request=self.get_error_url()
+        )
+        text = json.dumps(body)
+        return text
+
+    def get_headers(self, environ=None, scope=None):
+        """异常返回格式"""
+        return [("Content-Type", "application/json")]
\ No newline at end of file
--- a/projects/web_demo/web_demo/common/ext.py
+++ b/projects/web_demo/web_demo/common/ext.py
+import hashlib
+import mimetypes
+
+
+def is_pdf(filename, file):
+    """
+    判断文件是否为PDF格式。
+
+    :param filename: 文件名
+    :param file: 文件对象
+    :return: 如果文件是PDF格式，则返回True，否则返回False
+    """
+    # 检查文件扩展名  https://arxiv.org/pdf/2405.08702 pdf链接可能存在不带扩展名的情况，先注释
+    # if not filename.endswith('.pdf'):
+    #     return False
+
+    # 检查MIME类型
+    mime_type, _ = mimetypes.guess_type(filename)
+    print(mime_type)
+    if mime_type != 'application/pdf':
+        return False
+
+    # 可选：读取文件的前几KB内容并检查MIME类型
+    # 这一步是可选的，用于更严格的检查
+    # if not mimetypes.guess_type(filename, strict=False)[0] == 'application/pdf':
+    #     return False
+
+    # 检查文件内容
+    file_start = file.read(5)
+    file.seek(0)
+    if not file_start.startswith(b'%PDF-'):
+        return False
+
+    return True
+
+
+def url_is_pdf(file):
+    """
+    判断文件是否为PDF格式。
+
+    :param file: 文件对象
+    :return: 如果文件是PDF格式，则返回True，否则返回False
+    """
+    # 检查文件内容
+    file_start = file.read(5)
+    file.seek(0)
+    if not file_start.startswith(b'%PDF-'):
+        return False
+
+    return True
+
+
+def calculate_file_hash(file, algorithm='sha256'):
+    """
+    计算给定文件的哈希值。
+
+    :param file: 文件对象
+    :param algorithm: 哈希算法的名字，如:'sha256', 'md5', 'sha1'等
+    :return: 文件的哈希值
+    """
+    hash_func = getattr(hashlib, algorithm)()
+    block_size = 65536  # 64KB chunks
+    # with open(file_path, 'rb') as file:
+    buffer = file.read(block_size)
+    while len(buffer) > 0:
+        hash_func.update(buffer)
+        buffer = file.read(block_size)
+    file.seek(0)
+    return hash_func.hexdigest()
+
+
+def singleton_func(cls):
+    instance = {}
+
+    def _singleton(*args, **kwargs):
+        if cls not in instance:
+            instance[cls] = cls(*args, **kwargs)
+        return instance[cls]
+
+    return _singleton
--- a/projects/web_demo/web_demo/common/import_models.py
+++ b/projects/web_demo/web_demo/common/import_models.py
+from api.analysis.models import *
\ No newline at end of file
--- a/projects/web_demo/web_demo/common/logger.py
+++ b/projects/web_demo/web_demo/common/logger.py
+import os
+from loguru import logger
+from pathlib import Path
+from datetime import datetime
+
+
+def setup_log(config):
+    """
+    Setup logging
+    :param config:  config file
+    :return:
+    """
+    log_path = os.path.join(Path(__file__).parent.parent, "log")
+    if not Path(log_path).exists():
+        Path(log_path).mkdir(parents=True, exist_ok=True)
+    log_level = config.get("LOG_LEVEL")
+    log_name = f'log_{datetime.now().strftime("%Y-%m-%d")}.log'
+    log_file_path = os.path.join(log_path, log_name)
+    logger.add(str(log_file_path), rotation='00:00', encoding='utf-8', level=log_level, enqueue=True)
--- a/projects/web_demo/web_demo/common/web_hook.py
+++ b/projects/web_demo/web_demo/common/web_hook.py
+
+def before_request():
+    """Request hook that performs no work and returns None."""
+    return None
+
+
+def after_request(response):
+    response.headers.add('Access-Control-Allow-Origin', '*')
+    response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
+    return response
--- a/projects/web_demo/web_demo/config/__init__.py
+++ b/projects/web_demo/web_demo/config/__init__.py
--- a/projects/web_demo/web_demo/config/config.yaml
+++ b/projects/web_demo/web_demo/config/config.yaml
+# 基本配置
+BaseConfig: &base
+  DEBUG: false
+  PORT: 5559
+  LOG_LEVEL: "DEBUG"
+  SQLALCHEMY_TRACK_MODIFICATIONS: true
+  SQLALCHEMY_DATABASE_URI: ""
+  PROPAGATE_EXCEPTIONS: true
+  SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
+  JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&"
+  JWT_ACCESS_TOKEN_EXPIRES: 3600
+  PDF_UPLOAD_FOLDER: "upload_pdf"
+  PDF_ANALYSIS_FOLDER: "analysis_pdf"
+  # 前端项目打包的路径
+  REACT_APP_DIST: "../../web/dist/"
+
+# 开发配置
+DevelopmentConfig:
+  <<: *base
+  database:
+    type: sqlite
+    path: config/mineru_web.db
+
+# 生产配置
+ProductionConfig:
+  <<: *base
+
+# 测试配置
+TestingConfig:
+  <<: *base
+
+# 当前使用配置
+CurrentConfig: "DevelopmentConfig"
--- a/projects/web_demo/web_demo/config/mineru_web.db
+++ b/projects/web_demo/web_demo/config/mineru_web.db
--- a/projects/web_demo/web_demo/static/__init__.py
+++ b/projects/web_demo/web_demo/static/__init__.py
--- a/requirements-docker.txt
+++ b/requirements-docker.txt
@@ -5,14 +5,14 @@ PyMuPDF>=1.24.9
 loguru>=0.6.0
 numpy>=1.21.6,<2.0.0
 fast-langdetect==0.2.0
-wordninja>=2.0.0
 scikit-learn>=1.0.2
 pdfminer.six==20231228
-unimernet==0.1.6
+unimernet==0.2.1
 matplotlib
 ultralytics
 paddleocr==2.7.3
 paddlepaddle==3.0.0b1
 pypandoc
 struct-eqtable==0.1.0
+doclayout-yolo==0.0.2
 detectron2
--- a/requirements-qa.txt
+++ b/requirements-qa.txt
@@ -16,4 +16,5 @@ pypandoc
 pyopenssl==24.0.0
 struct-eqtable==0.1.0
 pytest-cov
-beautifulsoup4
\ No newline at end of file
+beautifulsoup4
+coverage
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,6 @@ pdfminer.six==20231228
 pydantic>=2.7.2,<2.8.0
 PyMuPDF>=1.24.9
 scikit-learn>=1.0.2
-wordninja>=2.0.0
+torch>=2.2.2,<=2.3.1
+transformers
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@ if __name__ == '__main__':
                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
                     ],
-            "full": ["unimernet==0.1.6",  # 0.1.6版本大幅裁剪依赖包范围，推荐使用此版本
+            "full": ["unimernet==0.2.1",  # unimernet升级0.2.1
                     "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1及之后不提供windows的预编译包，避免一些没有编译环境的windows设备安装失败
                     "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # linux 和 macos 不应限制matplotlib的最高版本，以避免无法更新导致的一些bug
                     "ultralytics",  # yolov8,公式检测
@@ -45,6 +45,7 @@ if __name__ == '__main__':
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # windows版本3.0.0b1效率下降，需锁定2.6.1
                     "pypandoc",  # 表格解析latex转html
                     "struct-eqtable==0.1.0",  # 表格解析
+                     "doclayout_yolo==0.0.2",  # doclayout_yolo
                     "detectron2"
                     ],
        },

--- a/tests/clean_coverage.py
+++ b/tests/clean_coverage.py
+"""
+clean coverage
+"""
+import os
+import shutil
+
+def delete_file(path):
+    """delete file."""
+    if not os.path.exists(path):
+        if os.path.isfile(path):
+            try:
+                os.remove(path)
+                print(f"File '{path}' deleted.")
+            except TypeError as e:
+                print(f"Error deleting file '{path}': {e}")
+    elif os.path.isdir(path):
+        try:
+            shutil.rmtree(path)
+            print(f"Directory '{path}' and its contents deleted.")
+        except TypeError as e:
+            print(f"Error deleting directory '{path}': {e}")
+
+if __name__ == "__main__":
+    # Remove the "htmlcov/" directory (presumably the HTML coverage report -- confirm).
+    delete_file("htmlcov/")
+    # Removal of the ".coverage" file is currently disabled.
+    #delete_file(".coverage")
--- a/tests/get_coverage.py
+++ b/tests/get_coverage.py
@@ -2,7 +2,7 @@
 get cov
 """
 from bs4 import BeautifulSoup
-
+import shutil
 def get_covrage():
    """get covrage"""
    # 发送请求获取网页内容

--- a/tests/retry_env.sh
+++ b/tests/retry_env.sh
 #!/bin/bash

-# 定义最大重试次数
 max_retries=5
 retry_count=0

 while true; do
    # prepare env
-    source activate MinerU
-    pip install -r requirements-qa.txt
-    pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
-    pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+    #python -m pip install -r requirements-qa.txt
+    python -m pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
+    python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
    exit_code=$?
    if [ $exit_code -eq 0 ]; then
        echo "test.sh 成功执行！"
@@ -21,6 +19,6 @@ while true; do
            exit 1
        fi
        echo "test.sh 执行失败 (退出码: $exit_code)。尝试第 $retry_count 次重试..."
-        sleep 5  # 等待 5 秒后重试
+        sleep 5
    fi
 done