Unverified commit ece7f8d5, authored by Kaiwen Liu, committed by GitHub
Browse files

Merge pull request #6 from opendatalab/dev

Dev
parents 98362a6e 702b6ac9
flask-cors
flask-jwt-extended
flask-marshmallow
flask-migrate
flask-restful
flask-sqlalchemy
flask
greenlet
loguru
marshmallow-sqlalchemy
marshmallow
pyjwt
pyyaml
...@@ -4,7 +4,7 @@ from common.web_hook import before_request ...@@ -4,7 +4,7 @@ from common.web_hook import before_request
from common.logger import setup_log from common.logger import setup_log
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print("root_dir", root_dir)
def _register_db(flask_app): def _register_db(flask_app):
from common import import_models from common import import_models
...@@ -30,6 +30,8 @@ def create_app(config): ...@@ -30,6 +30,8 @@ def create_app(config):
ma.init_app(app=app) ma.init_app(app=app)
from .analysis import analysis_blue from .analysis import analysis_blue
app.register_blueprint(analysis_blue) app.register_blueprint(analysis_blue)
from .react_app import react_app_blue
app.register_blueprint(react_app_blue)
app.before_request(before_request) app.before_request(before_request)
......
...@@ -4,6 +4,7 @@ from .upload_view import UploadPdfView ...@@ -4,6 +4,7 @@ from .upload_view import UploadPdfView
from .analysis_view import AnalysisTaskView, AnalysisTaskProgressView from .analysis_view import AnalysisTaskView, AnalysisTaskProgressView
from .img_md_view import ImgView, MdView from .img_md_view import ImgView, MdView
from .task_view import TaskView, HistoricalTasksView, DeleteTaskView from .task_view import TaskView, HistoricalTasksView, DeleteTaskView
from .markdown_view import MarkdownView
analysis_blue = Blueprint('analysis', __name__) analysis_blue = Blueprint('analysis', __name__)
...@@ -15,4 +16,5 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img') ...@@ -15,4 +16,5 @@ api_v2.add_resource(ImgView, '/analysis/pdf_img')
api_v2.add_resource(MdView, '/analysis/pdf_md') api_v2.add_resource(MdView, '/analysis/pdf_md')
api_v2.add_resource(TaskView, '/extract/taskQueue') api_v2.add_resource(TaskView, '/extract/taskQueue')
api_v2.add_resource(HistoricalTasksView, '/extract/list') api_v2.add_resource(HistoricalTasksView, '/extract/list')
api_v2.add_resource(DeleteTaskView, '/extract/task') api_v2.add_resource(DeleteTaskView, '/extract/task/<int:id>')
\ No newline at end of file api_v2.add_resource(MarkdownView, '/extract/markdown')
\ No newline at end of file
import json import json
import threading import threading
from multiprocessing import Process
from pathlib import Path from pathlib import Path
from flask import request, current_app, url_for from flask import request, current_app, url_for
from flask_restful import Resource from flask_restful import Resource
...@@ -29,12 +30,15 @@ class AnalysisTaskProgressView(Resource): ...@@ -29,12 +30,15 @@ class AnalysisTaskProgressView(Resource):
case 'pdf': case 'pdf':
analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == analysis_task.analysis_pdf_id).first() analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == analysis_task.analysis_pdf_id).first()
file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False) file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
file_name_split = analysis_task.file_name.split("_")
file_name = file_name_split[-1] if file_name_split else analysis_task.file_name
if analysis_task.status == 0: if analysis_task.status == 0:
data = { data = {
"state": task_state_map.get(analysis_task.status), "state": task_state_map.get(analysis_task.status),
"status": analysis_pdf.status, "status": analysis_pdf.status,
"url": file_url, "url": file_url,
"fileName": analysis_task.file_name, "fileName": file_name,
"file_key": analysis_task.file_key,
"content": [], "content": [],
"markdownUrl": [], "markdownUrl": [],
"fullMdLink": "", "fullMdLink": "",
...@@ -50,7 +54,8 @@ class AnalysisTaskProgressView(Resource): ...@@ -50,7 +54,8 @@ class AnalysisTaskProgressView(Resource):
"state": task_state_map.get(analysis_task.status), "state": task_state_map.get(analysis_task.status),
"status": analysis_pdf.status, "status": analysis_pdf.status,
"url": file_url, "url": file_url,
"fileName": analysis_task.file_name, "fileName": file_name,
"file_key": analysis_task.file_key,
"content": bbox_info, "content": bbox_info,
"markdownUrl": md_link_list, "markdownUrl": md_link_list,
"fullMdLink": full_md_link, "fullMdLink": full_md_link,
...@@ -59,10 +64,11 @@ class AnalysisTaskProgressView(Resource): ...@@ -59,10 +64,11 @@ class AnalysisTaskProgressView(Resource):
return generate_response(data=data) return generate_response(data=data)
else: # 任务异常结束 else: # 任务异常结束
data = { data = {
"state": task_state_map.get(analysis_task.status), "state": "failed",
"status": analysis_pdf.status, "status": analysis_pdf.status,
"url": file_url, "url": file_url,
"fileName": analysis_task.file_name, "fileName": file_name,
"file_key": analysis_task.file_key,
"content": [], "content": [],
"markdownUrl": [], "markdownUrl": [],
"fullMdLink": "", "fullMdLink": "",
...@@ -75,7 +81,8 @@ class AnalysisTaskProgressView(Resource): ...@@ -75,7 +81,8 @@ class AnalysisTaskProgressView(Resource):
"state": task_state_map.get(analysis_task.status), "state": task_state_map.get(analysis_task.status),
"status": analysis_pdf.status, "status": analysis_pdf.status,
"url": file_url, "url": file_url,
"fileName": analysis_task.file_name, "fileName": file_name,
"file_key": analysis_task.file_key,
"content": [], "content": [],
"markdownUrl": [], "markdownUrl": [],
"fullMdLink": "", "fullMdLink": "",
...@@ -83,13 +90,13 @@ class AnalysisTaskProgressView(Resource): ...@@ -83,13 +90,13 @@ class AnalysisTaskProgressView(Resource):
} }
return generate_response(data=data) return generate_response(data=data)
case 'formula-detect': case 'formula-detect':
pass return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
case 'formula-extract': case 'formula-extract':
pass return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
case 'table-recogn': case 'table-recogn':
return generate_response(code=400, msg="Not yet supported", msgZH="尚不支持") return generate_response(code=400, msg="Not yet supported", msgZH="功能待开发")
case _: case _:
return generate_response() return generate_response(code=400, msg="Not yet supported", msgZH="参数不支持")
class AnalysisTaskView(Resource): class AnalysisTaskView(Resource):
...@@ -181,6 +188,8 @@ class AnalysisTaskView(Resource): ...@@ -181,6 +188,8 @@ class AnalysisTaskView(Resource):
params = json.loads(request.data) params = json.loads(request.data)
id = params.get('id') id = params.get('id')
analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id).first() analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id).first()
if not analysis_task:
return generate_response(code=400, msg="Invalid ID", msgZH="无效id")
match analysis_task.task_type: match analysis_task.task_type:
case 'pdf': case 'pdf':
task_r_p = AnalysisTask.query.filter(AnalysisTask.status.in_([0, 2])).first() task_r_p = AnalysisTask.query.filter(AnalysisTask.status.in_([0, 2])).first()
...@@ -208,16 +217,18 @@ class AnalysisTaskView(Resource): ...@@ -208,16 +217,18 @@ class AnalysisTaskView(Resource):
pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER'] pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}" pdf_dir = f"{current_app.static_folder}/{pdf_analysis_folder}/{file_stem}"
image_dir = f"{pdf_dir}/images" image_dir = f"{pdf_dir}/images"
t = threading.Thread(target=analysis_pdf_task, process = Process(target=analysis_pdf_task,
args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr, args=(pdf_dir, image_dir, file_path, analysis_task.is_ocr,
analysis_task.analysis_pdf_id)) analysis_task.analysis_pdf_id))
t.start() process.start()
# 生成文件的URL路径 # 生成文件的URL路径
file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False) file_url = url_for('analysis.uploadpdfview', filename=analysis_task.file_name, as_attachment=False)
file_name_split = analysis_task.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else analysis_task.file_name
data = { data = {
"url": file_url, "url": file_url,
"fileName": analysis_task.file_name, "fileName": new_file_name,
"id": analysis_task.id "id": analysis_task.id
} }
return generate_response(data=data) return generate_response(data=data)
......
...@@ -2,8 +2,8 @@ import os ...@@ -2,8 +2,8 @@ import os
task_state_map = { task_state_map = {
0: "running", 0: "running",
1: "finished", 1: "done",
2: "pending", 2: "pending"
} }
......
import json
from pathlib import Path
from flask import request, current_app
from flask_restful import Resource
from common.custom_response import generate_response
class MarkdownView(Resource):
    """REST resource that overwrites the per-page markdown files of an analysed PDF."""

    @staticmethod
    def _page_sort_key(md_path):
        """Sort key that orders numeric stems numerically ("2.md" before "10.md").

        Files whose stem is not a plain ASCII number sort after the numeric
        ones, by name, so the ordering is always deterministic.
        """
        stem = md_path.stem
        if stem.isascii() and stem.isdigit():
            return 0, int(stem), stem
        return 1, 0, stem

    def put(self):
        """
        Edit markdown.

        Reads a JSON request body with two fields:

        * ``file_key`` - prefix of the analysis directory name to update.
        * ``data`` - mapping of page file stem to the new markdown text.

        Every ``<stem>.md`` that already exists directly inside the matched
        directory is overwritten, then ``full.md`` is rebuilt from all page
        files in that directory.

        :return: ``generate_response()`` on success, or a ``code=400``
            response when the body, ``data`` or ``file_key`` is invalid.
        """
        # Malformed JSON (or undecodable bytes) must produce a 400, not an
        # unhandled exception.
        try:
            params = json.loads(request.data)
        except ValueError:
            params = None
        if not isinstance(params, dict):
            return generate_response(code=400, msg="Invalid JSON body", msgZH="请求体不是合法的JSON对象")

        file_key = params.get('file_key')
        data = params.get('data', {})
        if not data:
            return generate_response(code=400, msg="empty data", msgZH="数据为空,无法更新markdown")
        # Validate everything before touching the disk: opening a page with
        # 'w' truncates it, so a bad value discovered later would lose data.
        if not isinstance(data, dict) or not all(isinstance(text, str) for text in data.values()):
            return generate_response(code=400, msg="Invalid data", msgZH="数据格式错误")
        # A missing key would crash startswith(); an empty one would match
        # the first directory found, whatever it is.
        if not isinstance(file_key, str) or not file_key:
            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")

        pdf_analysis_folder = current_app.config['PDF_ANALYSIS_FOLDER']
        analysis_root = Path(f"{current_app.static_folder}/{pdf_analysis_folder}")
        markdown_file_dir = None
        if analysis_root.is_dir():
            markdown_file_dir = next(
                (entry for entry in analysis_root.iterdir()
                 if entry.is_dir() and entry.name.startswith(file_key)),
                None,
            )
        if markdown_file_dir is None:
            return generate_response(code=400, msg="Invalid file_key", msgZH="文件哈希错误")

        for page_name, text in data.items():
            md_path = markdown_file_dir / f"{page_name}.md"
            # The page name comes from the client: refuse anything that would
            # resolve outside the matched directory ("../x", "/abs/x", "sub/x")
            # and only update pages that already exist.
            if md_path.parent != markdown_file_dir or not md_path.is_file():
                continue
            md_path.write_text(text, encoding="utf-8")

        # iterdir() yields entries in arbitrary order, so sort the pages
        # explicitly to keep full.md in page order.
        page_files = sorted(
            (entry for entry in markdown_file_dir.iterdir()
             if entry.is_file() and entry.suffix == ".md" and entry.stem != "full"),
            key=self._page_sort_key,
        )
        full_content = "".join(page.read_text(encoding="utf-8") + "\n" for page in page_files)
        (markdown_file_dir / "full.md").write_text(full_content, encoding="utf-8")
        return generate_response()
...@@ -9,7 +9,7 @@ class AnalysisTask(db.Model): ...@@ -9,7 +9,7 @@ class AnalysisTask(db.Model):
file_name = db.Column(db.Text, comment="文件名称") file_name = db.Column(db.Text, comment="文件名称")
task_type = db.Column(db.String(128), comment="任务类型") task_type = db.Column(db.String(128), comment="任务类型")
is_ocr = db.Column(db.Boolean, default=False, comment="是否ocr") is_ocr = db.Column(db.Boolean, default=False, comment="是否ocr")
status = db.Column(db.Integer, default=0, comment="状态") # 0 running 1 finished 2 pending status = db.Column(db.Integer, default=0, comment="状态") # 0 running 1 done 2 pending
analysis_pdf_id = db.Column(db.Integer, comment="analysis_pdf的id") analysis_pdf_id = db.Column(db.Integer, comment="analysis_pdf的id")
create_date = db.Column(db.DateTime(), nullable=False, default=datetime.now) create_date = db.Column(db.DateTime(), nullable=False, default=datetime.now)
update_date = db.Column(db.DateTime(), nullable=False, default=datetime.now, onupdate=datetime.now) update_date = db.Column(db.DateTime(), nullable=False, default=datetime.now, onupdate=datetime.now)
......
...@@ -88,15 +88,16 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id): ...@@ -88,15 +88,16 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
img_name = Path(img).name img_name = Path(img).name
regex = re.compile(fr'.*\((.*?{img_name})') regex = re.compile(fr'.*\((.*?{img_name})')
regex_result = regex.search(md_content) regex_result = regex.search(md_content)
img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False) if regex_result:
md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}") img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False)
md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}")
full_md_content = "" full_md_content = ""
for item in json.loads(md_content): for item in json.loads(md_content):
full_md_content += item["md_content"] + "\n" full_md_content += item["md_content"] + "\n"
full_md_name = "full.md" full_md_name = "full.md"
with open(f"{pdf_dir}/{full_md_name}", "w") as file: with open(f"{pdf_dir}/{full_md_name}", "w", encoding="utf-8") as file:
file.write(full_md_content) file.write(full_md_content)
with app.app_context(): with app.app_context():
full_md_link = url_for('analysis.mdview', filename=full_md_name, as_attachment=False) full_md_link = url_for('analysis.mdview', filename=full_md_name, as_attachment=False)
...@@ -107,7 +108,7 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id): ...@@ -107,7 +108,7 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
for n, md in enumerate(json.loads(md_content)): for n, md in enumerate(json.loads(md_content)):
md_content = md["md_content"] md_content = md["md_content"]
md_name = f"{md.get('page_no', n)}.md" md_name = f"{md.get('page_no', n)}.md"
with open(f"{pdf_dir}/{md_name}", "w") as file: with open(f"{pdf_dir}/{md_name}", "w", encoding="utf-8") as file:
file.write(md_content) file.write(md_content)
md_url = url_for('analysis.mdview', filename=md_name, as_attachment=False) md_url = url_for('analysis.mdview', filename=md_name, as_attachment=False)
md_link_list.append(f"{md_url}&pdf={pdf_name}") md_link_list.append(f"{md_url}&pdf={pdf_name}")
......
...@@ -18,25 +18,33 @@ class TaskView(Resource): ...@@ -18,25 +18,33 @@ class TaskView(Resource):
analysis_task_pending = AnalysisTask.query.filter(AnalysisTask.status == 2).order_by( analysis_task_pending = AnalysisTask.query.filter(AnalysisTask.status == 2).order_by(
AnalysisTask.create_date.asc()).all() AnalysisTask.create_date.asc()).all()
pending_total = db.session.query(func.count(AnalysisTask.id)).filter(AnalysisTask.status == 2).scalar() pending_total = db.session.query(func.count(AnalysisTask.id)).filter(AnalysisTask.status == 2).scalar()
task_nums = pending_total + 1 if analysis_task_running:
data = [ task_nums = pending_total + 1
{ file_name_split = analysis_task_running.file_name.split("_")
"queues": task_nums, # 正在排队的任务总数 new_file_name = file_name_split[-1] if file_name_split else analysis_task_running.file_name
"rank": 1, data = [
"id": analysis_task_running.id, {
"url": url_for('analysis.uploadpdfview', filename=analysis_task_running.file_name, as_attachment=False), "queues": task_nums, # 正在排队的任务总数
"fileName": analysis_task_running.file_name, "rank": 1,
"type": analysis_task_running.task_type, "id": analysis_task_running.id,
"state": task_state_map.get(analysis_task_running.status), "url": url_for('analysis.uploadpdfview', filename=analysis_task_running.file_name, as_attachment=False),
} "fileName": new_file_name,
] "type": analysis_task_running.task_type,
"state": task_state_map.get(analysis_task_running.status),
}
]
else:
task_nums = pending_total
data = []
for n, task in enumerate(analysis_task_pending): for n, task in enumerate(analysis_task_pending):
file_name_split = task.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else task.file_name
data.append({ data.append({
"queues": task_nums, # 正在排队的任务总数 "queues": task_nums, # 正在排队的任务总数
"rank": n + 2, "rank": n + 2,
"id": task.id, "id": task.id,
"url": url_for('analysis.uploadpdfview', filename=task.file_name, as_attachment=False), "url": url_for('analysis.uploadpdfview', filename=task.file_name, as_attachment=False),
"fileName": task.file_name, "fileName": new_file_name,
"type": task.task_type, "type": task.task_type,
"state": task_state_map.get(task.status), "state": task_state_map.get(task.status),
}) })
...@@ -59,8 +67,10 @@ class HistoricalTasksView(Resource): ...@@ -59,8 +67,10 @@ class HistoricalTasksView(Resource):
error_out=False) error_out=False)
data = [] data = []
for n, task in enumerate(analysis_task): for n, task in enumerate(analysis_task):
file_name_split = task.file_name.split("_")
new_file_name = file_name_split[-1] if file_name_split else task.file_name
data.append({ data.append({
"fileName": task.file_name, "fileName": new_file_name,
"id": task.id, "id": task.id,
"type": task.task_type, "type": task.task_type,
"state": task_state_map.get(task.status), "state": task_state_map.get(task.status),
...@@ -75,14 +85,11 @@ class HistoricalTasksView(Resource): ...@@ -75,14 +85,11 @@ class HistoricalTasksView(Resource):
class DeleteTaskView(Resource): class DeleteTaskView(Resource):
def delete(self): def delete(self, id):
""" """
删除任务历史记录 删除任务历史记录
:return: :return:
""" """
params = json.loads(request.data)
id = params.get('id')
analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id, AnalysisTask.status != 0).first() analysis_task = AnalysisTask.query.filter(AnalysisTask.id == id, AnalysisTask.status != 0).first()
if analysis_task: if analysis_task:
analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == AnalysisTask.analysis_pdf_id).first() analysis_pdf = AnalysisPdf.query.filter(AnalysisPdf.id == AnalysisTask.analysis_pdf_id).first()
......
import json import json
import time
import traceback import traceback
import requests import requests
from flask import request, current_app, url_for, send_from_directory from flask import request, current_app, url_for, send_from_directory
...@@ -67,8 +68,7 @@ class UploadPdfView(Resource): ...@@ -67,8 +68,7 @@ class UploadPdfView(Resource):
upload_dir = f"{current_app.static_folder}/{pdf_upload_folder}" upload_dir = f"{current_app.static_folder}/{pdf_upload_folder}"
if not Path(upload_dir).exists(): if not Path(upload_dir).exists():
Path(upload_dir).mkdir(parents=True, exist_ok=True) Path(upload_dir).mkdir(parents=True, exist_ok=True)
file_key = calculate_file_hash(file) file_key = f"{calculate_file_hash(file)}{int(time.time())}"
# new_filename = f"{int(time.time())}_{filename}"
new_filename = f"{file_key}_{filename}" new_filename = f"{file_key}_{filename}"
file_path = f"{upload_dir}/{new_filename}" file_path = f"{upload_dir}/{new_filename}"
# file.save(file_path) # file.save(file_path)
......
...@@ -59,3 +59,4 @@ db = SQLAlchemy() ...@@ -59,3 +59,4 @@ db = SQLAlchemy()
migrate = Migrate() migrate = Migrate()
jwt = JWTManager() jwt = JWTManager()
ma = Marshmallow() ma = Marshmallow()
folder = app.config.get("REACT_APP_DIST")
from pathlib import Path
from flask import Blueprint
from ..extentions import app, Api
from .react_app_view import ReactAppView
from loguru import logger
# Directory holding the built React front-end. Taken from the REACT_APP_DIST
# config key; a relative value (including the fallback) is made absolute by
# Path.resolve(), i.e. relative to the process's current working directory.
folder = Path(app.config.get("REACT_APP_DIST", "../../web/dist/")).resolve()
logger.info(f"react_app folder: {folder}")
# Blueprint that uses that directory both as its static folder (served with an
# empty static_url_path) and as its template folder.
react_app_blue = Blueprint('react_app', __name__, static_folder=folder, static_url_path='', template_folder=folder)
react_app_api = Api(react_app_blue, prefix='')
# Route "/" to ReactAppView.
react_app_api.add_resource(ReactAppView, '/')
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment