Unverified commit d39aa87e authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2834 from myhloli/dev

Dev
parents 3043f55e 359110e3
# Use the official sglang image
-FROM lmsysorg/sglang:v0.4.7-cu124
+FROM lmsysorg/sglang:v0.4.8-cu124
# install mineru latest
RUN python3 -m pip install -U 'mineru[core]' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages
......
# Use the official sglang image
-FROM lmsysorg/sglang:v0.4.7-cu124
+FROM lmsysorg/sglang:v0.4.8-cu124
# install mineru latest
RUN python3 -m pip install -U 'mineru[core]' --break-system-packages
......
@@ -75,9 +75,9 @@ def doc_analyze(
):
"""
    Increasing MIN_BATCH_INFERENCE_SIZE moderately can improve performance, though it may also increase GPU memory usage.
-    It can be set via the MINERU_MIN_BATCH_INFERENCE_SIZE environment variable; the default value is 100.
+    It can be set via the MINERU_MIN_BATCH_INFERENCE_SIZE environment variable; the default value is 128.
"""
-    min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
+    min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 128))
    # Collect information for all pages
    all_pages_info = []  # stores (dataset_index, page_index, img, ocr, lang, width, height)
......
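Since the batch floor is read from the environment at call time, it can be tuned without code changes. A minimal sketch (the value 256 is illustrative, not a recommendation from this PR):

import os

# Must be set before doc_analyze reads it; larger batches may use more VRAM.
os.environ['MINERU_MIN_BATCH_INFERENCE_SIZE'] = '256'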
@@ -25,6 +25,7 @@ class ModelSingleton:
backend: str,
model_path: str | None,
server_url: str | None,
+        **kwargs,
) -> BasePredictor:
key = (backend, model_path, server_url)
if key not in self._models:
@@ -34,6 +35,7 @@ class ModelSingleton:
backend=backend,
model_path=model_path,
server_url=server_url,
+                **kwargs,
)
return self._models[key]
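The new **kwargs pass-through lets engine options reach the predictor factory. A hedged sketch, reusing the exact options that appear in the commented-out gradio bootstrap later in this PR:

from mineru.backend.vlm.vlm_analyze import ModelSingleton

# kwargs are forwarded to the predictor factory; note the cache key is still
# (backend, model_path, server_url), so extra options only take effect on the
# first construction for a given key.
predictor = ModelSingleton().get_model(
    backend="sglang-engine",
    model_path=None,
    server_url=None,
    mem_fraction_static=0.5,
    enable_torch_compile=True,
)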
@@ -75,15 +77,15 @@ async def aio_doc_analyze(
if predictor is None:
predictor = ModelSingleton().get_model(backend, model_path, server_url)
-    load_images_start = time.time()
+    # load_images_start = time.time()
images_list, pdf_doc = load_images_from_pdf(pdf_bytes)
images_base64_list = [image_dict["img_base64"] for image_dict in images_list]
-    load_images_time = round(time.time() - load_images_start, 2)
-    logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
+    # load_images_time = round(time.time() - load_images_start, 2)
+    # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
-    infer_start = time.time()
+    # infer_start = time.time()
results = await predictor.aio_batch_predict(images=images_base64_list)
-    infer_time = round(time.time() - infer_start, 2)
-    logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
+    # infer_time = round(time.time() - infer_start, 2)
+    # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
-    return middle_json
+    return middle_json, results
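Callers of aio_doc_analyze must now unpack two values. A minimal consumer sketch, mirroring the call made in _async_process_vlm later in this PR (the server URL is illustrative):

middle_json, infer_result = await aio_doc_analyze(
    pdf_bytes,
    image_writer=image_writer,
    backend="sglang-client",
    server_url="http://localhost:30000",
)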
@@ -60,7 +60,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
'-l',
'--lang',
'lang',
-    type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']),
+    type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka',
+                       'latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']),
help="""
Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
Without languages specified, 'ch' will be used by default.
......
@@ -14,6 +14,7 @@ from mineru.utils.enum_class import MakeMode
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
+from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
@@ -73,155 +74,308 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
return output_bytes
def do_parse(
output_dir,
pdf_file_names: list[str],
pdf_bytes_list: list[bytes],
p_lang_list: list[str],
backend="pipeline",
parse_method="auto",
p_formula_enable=True,
p_table_enable=True,
server_url=None,
f_draw_layout_bbox=True,
f_draw_span_bbox=True,
f_dump_md=True,
f_dump_middle_json=True,
f_dump_model_output=True,
f_dump_orig_pdf=True,
f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
start_page_id=0,
end_page_id=None,
def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
"""准备处理PDF字节数据"""
result = []
for pdf_bytes in pdf_bytes_list:
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
result.append(new_pdf_bytes)
return result
def _process_output(
pdf_info,
pdf_bytes,
pdf_file_name,
local_md_dir,
local_image_dir,
md_writer,
f_draw_layout_bbox,
f_draw_span_bbox,
f_dump_orig_pdf,
f_dump_md,
f_dump_content_list,
f_dump_middle_json,
f_dump_model_output,
f_make_md_mode,
middle_json,
model_output=None,
is_pipeline=True
):
    """Write the output files for a parsed document."""
    from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
image_dir = str(os.path.basename(local_image_dir))
if f_dump_md:
make_func = pipeline_union_make if is_pipeline else vlm_union_make
md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
make_func = pipeline_union_make if is_pipeline else vlm_union_make
content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
if is_pipeline:
md_writer.write_string(
f"{pdf_file_name}_model.json",
json.dumps(model_output, ensure_ascii=False, indent=4),
)
else:
output_text = ("\n" + "-" * 50 + "\n").join(model_output)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
output_text,
)
logger.info(f"local output dir is {local_md_dir}")
def _process_pipeline(
output_dir,
pdf_file_names,
pdf_bytes_list,
p_lang_list,
parse_method,
p_formula_enable,
p_table_enable,
f_draw_layout_bbox,
f_draw_span_bbox,
f_dump_md,
f_dump_middle_json,
f_dump_model_output,
f_dump_orig_pdf,
f_dump_content_list,
f_make_md_mode,
):
"""处理pipeline后端逻辑"""
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
pipeline_doc_analyze(
pdf_bytes_list, p_lang_list, parse_method=parse_method,
formula_enable=p_formula_enable, table_enable=p_table_enable
)
)
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx]
_ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(
model_list, images_list, pdf_doc, image_writer,
_lang, _ocr_enable, p_formula_enable
)
pdf_info = middle_json["pdf_info"]
pdf_bytes = pdf_bytes_list[idx]
_process_output(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, model_json, is_pipeline=True
)
async def _async_process_vlm(
output_dir,
pdf_file_names,
pdf_bytes_list,
backend,
f_draw_layout_bbox,
f_draw_span_bbox,
f_dump_md,
f_dump_middle_json,
f_dump_model_output,
f_dump_orig_pdf,
f_dump_content_list,
f_make_md_mode,
server_url=None,
):
"""异步处理VLM后端逻辑"""
parse_method = "vlm"
f_draw_span_bbox = False
if not backend.endswith("client"):
server_url = None
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = await aio_vlm_doc_analyze(
pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url
)
pdf_info = middle_json["pdf_info"]
_process_output(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
)
def _process_vlm(
output_dir,
pdf_file_names,
pdf_bytes_list,
backend,
f_draw_layout_bbox,
f_draw_span_bbox,
f_dump_md,
f_dump_middle_json,
f_dump_model_output,
f_dump_orig_pdf,
f_dump_content_list,
f_make_md_mode,
server_url=None,
):
"""同步处理VLM后端逻辑"""
parse_method = "vlm"
f_draw_span_bbox = False
if not backend.endswith("client"):
server_url = None
if backend == "pipeline":
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
for idx, pdf_bytes in enumerate(pdf_bytes_list):
new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
pdf_bytes_list[idx] = new_pdf_bytes
infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)
for idx, model_list in enumerate(infer_results):
model_json = copy.deepcopy(model_list)
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
images_list = all_image_lists[idx]
pdf_doc = all_pdf_docs[idx]
_lang = lang_list[idx]
_ocr_enable = ocr_enabled_list[idx]
middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)
pdf_info = middle_json["pdf_info"]
pdf_bytes = pdf_bytes_list[idx]
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
middle_json, infer_result = vlm_doc_analyze(
pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
pdf_info = middle_json["pdf_info"]
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
_process_output(
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
f_make_md_mode, middle_json, infer_result, is_pipeline=False
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
if f_dump_model_output:
md_writer.write_string(
f"{pdf_file_name}_model.json",
json.dumps(model_json, ensure_ascii=False, indent=4),
)
def do_parse(
output_dir,
pdf_file_names: list[str],
pdf_bytes_list: list[bytes],
p_lang_list: list[str],
backend="pipeline",
parse_method="auto",
p_formula_enable=True,
p_table_enable=True,
server_url=None,
f_draw_layout_bbox=True,
f_draw_span_bbox=True,
f_dump_md=True,
f_dump_middle_json=True,
f_dump_model_output=True,
f_dump_orig_pdf=True,
f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
start_page_id=0,
end_page_id=None,
):
    # Preprocess the PDF byte data
pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)
logger.info(f"local output dir is {local_md_dir}")
if backend == "pipeline":
_process_pipeline(
output_dir, pdf_file_names, pdf_bytes_list, p_lang_list,
parse_method, p_formula_enable, p_table_enable,
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode
)
else:
if backend.startswith("vlm-"):
backend = backend[4:]
f_draw_span_bbox = False
parse_method = "vlm"
for idx, pdf_bytes in enumerate(pdf_bytes_list):
pdf_file_name = pdf_file_names[idx]
pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)
pdf_info = middle_json["pdf_info"]
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")
if f_draw_span_bbox:
draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")
if f_dump_orig_pdf:
md_writer.write(
f"{pdf_file_name}_origin.pdf",
pdf_bytes,
)
if f_dump_md:
image_dir = str(os.path.basename(local_image_dir))
md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
md_writer.write_string(
f"{pdf_file_name}.md",
md_content_str,
)
if f_dump_content_list:
image_dir = str(os.path.basename(local_image_dir))
content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
md_writer.write_string(
f"{pdf_file_name}_content_list.json",
json.dumps(content_list, ensure_ascii=False, indent=4),
)
if f_dump_middle_json:
md_writer.write_string(
f"{pdf_file_name}_middle.json",
json.dumps(middle_json, ensure_ascii=False, indent=4),
)
_process_vlm(
output_dir, pdf_file_names, pdf_bytes_list, backend,
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
server_url
)
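For reference, a hedged sketch of driving the refactored entry point; the file name and the pdf_bytes value are placeholders:

do_parse(
    output_dir="./output",
    pdf_file_names=["demo"],
    pdf_bytes_list=[pdf_bytes],  # raw bytes of the source PDF
    p_lang_list=["ch"],
    backend="pipeline",
)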
async def aio_do_parse(
output_dir,
pdf_file_names: list[str],
pdf_bytes_list: list[bytes],
p_lang_list: list[str],
backend="pipeline",
parse_method="auto",
p_formula_enable=True,
p_table_enable=True,
server_url=None,
f_draw_layout_bbox=True,
f_draw_span_bbox=True,
f_dump_md=True,
f_dump_middle_json=True,
f_dump_model_output=True,
f_dump_orig_pdf=True,
f_dump_content_list=True,
f_make_md_mode=MakeMode.MM_MD,
start_page_id=0,
end_page_id=None,
):
    # Preprocess the PDF byte data
pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id)
if f_dump_model_output:
model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
md_writer.write_string(
f"{pdf_file_name}_model_output.txt",
model_output,
)
if backend == "pipeline":
        # The pipeline mode does not support async yet, so fall back to synchronous processing
_process_pipeline(
output_dir, pdf_file_names, pdf_bytes_list, p_lang_list,
parse_method, p_formula_enable, p_table_enable,
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode
)
else:
if backend.startswith("vlm-"):
backend = backend[4:]
logger.info(f"local output dir is {local_md_dir}")
await _async_process_vlm(
output_dir, pdf_file_names, pdf_bytes_list, backend,
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
server_url
)
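And the async variant, driven from synchronous code; a sketch that assumes a running sglang server at the illustrative URL:

import asyncio

asyncio.run(aio_do_parse(
    output_dir="./output",
    pdf_file_names=["demo"],
    pdf_bytes_list=[pdf_bytes],
    p_lang_list=["ch"],
    backend="vlm-sglang-client",
    server_url="http://localhost:30000",
))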
......
import uuid
import os
from base64 import b64encode
import uvicorn
import argparse
from pathlib import Path
from glob import glob
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from typing import List, Optional
from loguru import logger
from mineru.cli.common import aio_do_parse, read_fn
from mineru.version import __version__
app = FastAPI()
def encode_image(image_path: str) -> str:
"""Encode image using base64"""
with open(image_path, "rb") as f:
return b64encode(f.read()).decode()
@app.post(path="/file_parse",)
async def parse_pdf(
files: List[UploadFile] = File(...),
output_dir: str = Form("./output"),
lang_list: List[str] = Form(["ch"]),
backend: str = Form("pipeline"),
parse_method: str = Form("auto"),
formula_enable: bool = Form(True),
table_enable: bool = Form(True),
server_url: Optional[str] = Form(None),
    return_md: bool = Form(True),
    return_middle_json: bool = Form(False),
    return_model_output: bool = Form(False),
    return_content_list: bool = Form(False),
return_images: bool = Form(False),
start_page_id: int = Form(0),
end_page_id: int = Form(99999),
):
try:
        # Create a unique output directory
unique_dir = os.path.join(output_dir, str(uuid.uuid4()))
os.makedirs(unique_dir, exist_ok=True)
        # Process the uploaded PDF files
pdf_file_names = []
pdf_bytes_list = []
for file in files:
content = await file.read()
file_path = Path(file.filename)
            # For image or PDF files, process them with read_fn
if file_path.suffix.lower() in [".pdf", ".png", ".jpeg", ".jpg"]:
                # Create a temporary file so read_fn can be used
temp_path = Path(unique_dir) / file_path.name
with open(temp_path, "wb") as f:
f.write(content)
try:
pdf_bytes = read_fn(temp_path)
pdf_bytes_list.append(pdf_bytes)
pdf_file_names.append(file_path.stem)
                    os.remove(temp_path)  # delete the temporary file
except Exception as e:
return JSONResponse(
status_code=400,
content={"error": f"处理文件失败: {str(e)}"}
)
else:
return JSONResponse(
status_code=400,
content={"error": f"不支持的文件类型: {file_path.suffix}"}
)
        # Set the language list, making sure its length matches the number of files
actual_lang_list = lang_list
if len(actual_lang_list) != len(pdf_file_names):
            # If the language list length does not match, use the first language, or default to "ch"
actual_lang_list = [actual_lang_list[0] if actual_lang_list else "ch"] * len(pdf_file_names)
        # Invoke the async processing function
await aio_do_parse(
output_dir=unique_dir,
pdf_file_names=pdf_file_names,
pdf_bytes_list=pdf_bytes_list,
p_lang_list=actual_lang_list,
backend=backend,
parse_method=parse_method,
p_formula_enable=formula_enable,
p_table_enable=table_enable,
server_url=server_url,
f_draw_layout_bbox=False,
f_draw_span_bbox=False,
            f_dump_md=return_md,
            f_dump_middle_json=return_middle_json,
            f_dump_model_output=return_model_output,
            f_dump_orig_pdf=False,
            f_dump_content_list=return_content_list,
start_page_id=start_page_id,
end_page_id=end_page_id,
)
        # Build the result paths
result_dict = {}
for pdf_name in pdf_file_names:
result_dict[pdf_name] = {}
data = result_dict[pdf_name]
if backend.startswith("pipeline"):
parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
else:
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
def get_infer_result(file_suffix_identifier: str):
"""从结果文件中读取推理结果"""
result_file_path = os.path.join(parse_dir, f"{pdf_name}{file_suffix_identifier}")
if os.path.exists(result_file_path):
with open(result_file_path, "r", encoding="utf-8") as fp:
return fp.read()
return None
if os.path.exists(parse_dir):
                if return_md:
data["md_content"] = get_infer_result(".md")
                if return_middle_json:
data["middle_json"] = get_infer_result("_middle.json")
if return_model_output:
if backend.startswith("pipeline"):
data["model_output"] = get_infer_result("_model.json")
else:
data["model_output"] = get_infer_result("_model_output.txt")
                if return_content_list:
data["content_list"] = get_infer_result("_content_list.json")
if return_images:
image_paths = glob(f"{parse_dir}/images/*.jpg")
data["images"] = {
os.path.basename(
image_path
): f"data:image/jpeg;base64,{encode_image(image_path)}"
for image_path in image_paths
}
return JSONResponse(
status_code=200,
content={
"backend": backend,
"version": __version__,
"results": result_dict
}
)
except Exception as e:
logger.exception(e)
return JSONResponse(
status_code=500,
content={"error": str(e)}
)
def main():
"""启动MinerU FastAPI服务器的命令行入口"""
parser = argparse.ArgumentParser(description='Start MinerU FastAPI Service')
parser.add_argument('--host', type=str, default='127.0.0.1', help='Server host (default: 127.0.0.1)')
parser.add_argument('--port', type=int, default=8000, help='Server port (default: 8000)')
parser.add_argument('--reload', action='store_true', help='Enable auto-reload (development mode)')
args = parser.parse_args()
print(f"Start MinerU FastAPI Service: http://{args.host}:{args.port}")
print("The API documentation can be accessed at the following address:")
print(f"- Swagger UI: http://{args.host}:{args.port}/docs")
print(f"- ReDoc: http://{args.host}:{args.port}/redoc")
uvicorn.run(
"mineru.cli.fast_api:app",
host=args.host,
port=args.port,
reload=args.reload
)
if __name__ == "__main__":
main()
\ No newline at end of file
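A hedged client-side sketch for the endpoint above, using the requests library; host and port match the main() defaults, and demo.pdf is a placeholder:

import requests

with open("demo.pdf", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:8000/file_parse",
        files={"files": ("demo.pdf", f, "application/pdf")},
        data={"backend": "pipeline", "return_md": "true"},
    )
# results are keyed by the uploaded file's stem
print(resp.json()["results"]["demo"]["md_content"])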
# Copyright (c) Opendatalab. All rights reserved.
import base64
import os
import re
import time
import zipfile
from pathlib import Path
import gradio as gr
from gradio_pdf import PDF
from loguru import logger
from mineru.cli.common import prepare_env, read_fn, aio_do_parse
from mineru.utils.hash_utils import str_sha256
async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language, backend, url):
os.makedirs(output_dir, exist_ok=True)
try:
file_name = f'{safe_stem(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}'
pdf_data = read_fn(doc_path)
if is_ocr:
parse_method = 'ocr'
else:
parse_method = 'auto'
if backend.startswith("vlm"):
parse_method = "vlm"
if not backend.endswith("client"):
url = None
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
await aio_do_parse(
output_dir=output_dir,
pdf_file_names=[file_name],
pdf_bytes_list=[pdf_data],
p_lang_list=[language],
parse_method=parse_method,
end_page_id=end_page_id,
p_formula_enable=formula_enable,
p_table_enable=table_enable,
backend=backend,
server_url=url,
)
return local_md_dir, file_name
except Exception as e:
logger.exception(e)
return None
def compress_directory_to_zip(directory_path, output_zip_path):
"""压缩指定目录到一个 ZIP 文件。
:param directory_path: 要压缩的目录路径
:param output_zip_path: 输出的 ZIP 文件路径
"""
try:
with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk all files and subdirectories under the directory
for root, dirs, files in os.walk(directory_path):
for file in files:
                    # Build the full file path
file_path = os.path.join(root, file)
                    # Compute the path relative to the directory root
arcname = os.path.relpath(file_path, directory_path)
                    # Add the file to the ZIP archive
zipf.write(file_path, arcname)
return 0
except Exception as e:
logger.exception(e)
return -1
def image_to_base64(image_path):
with open(image_path, 'rb') as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def replace_image_with_base64(markdown_text, image_dir_path):
    # Match image tags in the Markdown
    pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
    # Replace each image link with inline base64 data
def replace(match):
relative_path = match.group(1)
full_path = os.path.join(image_dir_path, relative_path)
base64_image = image_to_base64(full_path)
return f'![{relative_path}](data:image/jpeg;base64,{base64_image})'
    # Apply the replacement
return re.sub(pattern, replace, markdown_text)
async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None):
file_path = to_pdf(file_path)
    # Get the recognized md directory and build the archive path
local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url)
archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip')
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
    if zip_archive_success == 0:
        logger.info('Compression succeeded')
    else:
        logger.error('Compression failed')
md_path = os.path.join(local_md_dir, file_name + '.md')
with open(md_path, 'r', encoding='utf-8') as f:
txt_content = f.read()
md_content = replace_image_with_base64(txt_content, local_md_dir)
    # Return the path of the converted (layout) PDF
new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf')
return md_content, txt_content, archive_zip_path, new_pdf_path
latex_delimiters = [
{'left': '$$', 'right': '$$', 'display': True},
{'left': '$', 'right': '$', 'display': False},
{'left': '\\(', 'right': '\\)', 'display': False},
{'left': '\\[', 'right': '\\]', 'display': True},
]
header_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'resources', 'header.html')
with open(header_path, 'r') as file:
header = file.read()
latin_lang = [
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', # noqa: E126
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
'rs_cyrillic', 'bg', 'mn', 'abq', 'ady', 'kbd', 'ava', # noqa: E126
'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
east_slavic_lang = ["ru", "be", "uk"]
devanagari_lang = [
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126
'sa', 'bgc'
]
other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']
# all_lang = ['', 'auto']
all_lang = []
# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
all_lang.extend([*other_lang, *add_lang])
def safe_stem(file_path):
stem = Path(file_path).stem
    # Keep only letters, digits, underscores and dots; replace everything else with underscores
return re.sub(r'[^\w.]', '_', stem)
def to_pdf(file_path):
if file_path is None:
return None
pdf_bytes = read_fn(file_path)
# unique_filename = f'{uuid.uuid4()}.pdf'
unique_filename = f'{safe_stem(file_path)}.pdf'
    # Build the full file path
tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
    # Write the byte data to the file
with open(tmp_file_path, 'wb') as tmp_pdf_file:
tmp_pdf_file.write(pdf_bytes)
return tmp_file_path
def main():
example_enable = False
# try:
# print("Start init SgLang engine...")
# from mineru.backend.vlm.vlm_analyze import ModelSingleton
# modelsingleton = ModelSingleton()
# predictor = modelsingleton.get_model(
# "sglang-engine",
# None,
# None,
# mem_fraction_static=0.5,
# enable_torch_compile=True,
# )
# print("SgLang engine init successfully.")
# except Exception as e:
# logger.exception(e)
with gr.Blocks() as demo:
gr.HTML(header)
with gr.Row():
with gr.Column(variant='panel', scale=5):
with gr.Row():
file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'])
with gr.Row():
max_pages = gr.Slider(1, 20, 10, step=1, label='Max convert pages')
with gr.Row():
backend = gr.Dropdown(["pipeline", "vlm-transformers", "vlm-sglang-client"], label="Backend", value="pipeline")
with gr.Row(visible=True) as ocr_options:
language = gr.Dropdown(all_lang, label='Language', value='ch')
with gr.Row(visible=False) as client_options:
url = gr.Textbox(label='Server URL', value='http://localhost:30000', placeholder='http://localhost:30000')
with gr.Row(visible=True) as pipeline_options:
is_ocr = gr.Checkbox(label='Force enable OCR', value=False)
formula_enable = gr.Checkbox(label='Enable formula recognition', value=True)
table_enable = gr.Checkbox(label='Enable table recognition(test)', value=True)
with gr.Row():
change_bu = gr.Button('Convert')
clear_bu = gr.ClearButton(value='Clear')
pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800)
if example_enable:
example_root = os.path.join(os.path.dirname(__file__), 'examples')
if os.path.exists(example_root):
with gr.Accordion('Examples:'):
gr.Examples(
examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
_.endswith('pdf')],
inputs=file
)
with gr.Column(variant='panel', scale=5):
output_file = gr.File(label='convert result', interactive=False)
with gr.Tabs():
with gr.Tab('Markdown rendering'):
md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
latex_delimiters=latex_delimiters,
line_breaks=True)
with gr.Tab('Markdown text'):
md_text = gr.TextArea(lines=45, show_copy_button=True)
        # Update the interface when the backend selection changes
def update_interface(backend_choice):
if backend_choice in ["vlm-transformers", "vlm-sglang-engine"]:
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
elif backend_choice in ["vlm-sglang-client"]: # pipeline
return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
elif backend_choice in ["pipeline"]:
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
else:
pass
        # Register event handlers
backend.change(
fn=update_interface,
inputs=[backend],
outputs=[client_options, ocr_options, pipeline_options]
)
file.change(fn=to_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[file, max_pages, is_ocr, formula_enable, table_enable, language, backend, url],
outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([file, md, pdf_show, md_text, output_file, is_ocr])
demo.launch(server_name='localhost')
if __name__ == '__main__':
main()
\ No newline at end of file
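With the mineru-gradio console script registered in pyproject.toml later in this PR, this demo can be started with a bare mineru-gradio command; demo.launch(server_name='localhost') serves it on Gradio's default port 7860.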
@@ -3,6 +3,7 @@ import os
import sys
import click
import requests
+from loguru import logger
from mineru.utils.enum_class import ModelPath
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
@@ -54,7 +55,32 @@ def configure_model(model_dir, model_type):
}
download_and_modify_json(json_url, config_file, json_mods)
-    print(f'The configuration file has been successfully configured, the path is: {config_file}')
+    logger.info(f'The configuration file has been successfully configured, the path is: {config_file}')
+def download_pipeline_models():
+    """Download the pipeline models."""
+    model_paths = [
+        ModelPath.doclayout_yolo,
+        ModelPath.yolo_v8_mfd,
+        ModelPath.unimernet_small,
+        ModelPath.pytorch_paddle,
+        ModelPath.layout_reader,
+        ModelPath.slanet_plus
+    ]
+    download_finish_path = ""
+    for model_path in model_paths:
+        logger.info(f"Downloading model: {model_path}")
+        download_finish_path = auto_download_and_get_model_root_path(model_path, repo_mode='pipeline')
+    logger.info(f"Pipeline models downloaded successfully to: {download_finish_path}")
+    configure_model(download_finish_path, "pipeline")
+
+
+def download_vlm_models():
+    """Download the VLM models."""
+    download_finish_path = auto_download_and_get_model_root_path("/", repo_mode='vlm')
+    logger.info(f"VLM models downloaded successfully to: {download_finish_path}")
+    configure_model(download_finish_path, "vlm")
@click.command()
@@ -102,30 +128,7 @@ def download_models(model_source, model_type):
default='all'
)
click.echo(f"Downloading {model_type} model from {os.getenv('MINERU_MODEL_SOURCE', None)}...")
def download_pipeline_models():
"""下载Pipeline模型"""
model_paths = [
ModelPath.doclayout_yolo,
ModelPath.yolo_v8_mfd,
ModelPath.unimernet_small,
ModelPath.pytorch_paddle,
ModelPath.layout_reader,
ModelPath.slanet_plus
]
download_finish_path = ""
for model_path in model_paths:
click.echo(f"Downloading model: {model_path}")
download_finish_path = auto_download_and_get_model_root_path(model_path, repo_mode='pipeline')
click.echo(f"Pipeline models downloaded successfully to: {download_finish_path}")
configure_model(download_finish_path, "pipeline")
def download_vlm_models():
"""下载VLM模型"""
download_finish_path = auto_download_and_get_model_root_path("/", repo_mode='vlm')
click.echo(f"VLM models downloaded successfully to: {download_finish_path}")
configure_model(download_finish_path, "vlm")
logger.info(f"Downloading {model_type} model from {os.getenv('MINERU_MODEL_SOURCE', None)}...")
try:
if model_type == 'pipeline':
@@ -140,7 +143,7 @@ def download_models(model_source, model_type):
sys.exit(1)
except Exception as e:
click.echo(f"Download failed: {str(e)}", err=True)
logger.exception(f"An error occurred while downloading models: {str(e)}")
sys.exit(1)
if __name__ == '__main__':
......
@@ -26,9 +26,10 @@ latin_lang = [
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
-    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',  # noqa: E126
+    'rs_cyrillic', 'bg', 'mn', 'abq', 'ady', 'kbd', 'ava',  # noqa: E126
'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
]
east_slavic_lang = ["ru", "be", "uk"]
devanagari_lang = [
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126
'sa', 'bgc'
@@ -69,6 +70,8 @@ class PytorchPaddleOCR(TextSystem):
self.lang = 'cyrillic'
elif self.lang in devanagari_lang:
self.lang = 'devanagari'
+        elif self.lang in east_slavic_lang:
+            self.lang = 'east_slavic'
else:
pass
......
@@ -490,3 +490,82 @@ devanagari_PP-OCRv3_rec_infer:
# out_channels: 169
fc_decay: 0.00001
korean_PP-OCRv5_rec_infer:
model_type: rec
algorithm: SVTR_HGNet
Transform:
Backbone:
name: PPLCNetV3
scale: 0.95
Head:
name: MultiHead
out_channels_list:
CTCLabelDecode: 11947
head_list:
- CTCHead:
Neck:
name: svtr
dims: 120
depth: 2
hidden_dims: 120
kernel_size: [ 1, 3 ]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 384
max_text_length: 25
latin_PP-OCRv5_rec_infer:
model_type: rec
algorithm: SVTR_HGNet
Transform:
Backbone:
name: PPLCNetV3
scale: 0.95
Head:
name: MultiHead
out_channels_list:
CTCLabelDecode: 504
head_list:
- CTCHead:
Neck:
name: svtr
dims: 120
depth: 2
hidden_dims: 120
kernel_size: [ 1, 3 ]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 384
max_text_length: 25
eslav_PP-OCRv5_rec_infer:
model_type: rec
algorithm: SVTR_HGNet
Transform:
Backbone:
name: PPLCNetV3
scale: 0.95
Head:
name: MultiHead
out_channels_list:
CTCLabelDecode: 519
head_list:
- CTCHead:
Neck:
name: svtr
dims: 120
depth: 2
hidden_dims: 120
kernel_size: [ 1, 3 ]
use_guide: True
Head:
fc_decay: 0.00001
- NRTRHead:
nrtr_dim: 384
max_text_length: 25
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
]
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
©
{
}
\
|
@
^
~
÷
·
±
®
Ω
¢
£
¥
𝑢
𝜓
ƒ
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
Ø
Ù
Ú
Û
Ü
Ý
Þ
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
ø
ù
ú
û
ü
ý
þ
ÿ
¡
¤
¦
§
¨
ª
«
¬
¯
°
²
³
´
µ
¸
¹
º
»
¼
½
¾
¿
×
Α
α
Β
β
Γ
γ
Δ
δ
Ε
ε
Ζ
ζ
Η
η
Θ
θ
Ι
ι
Κ
κ
Λ
λ
Μ
μ
Ν
ν
Ξ
ξ
Ο
ο
Π
π
Ρ
ρ
Σ
σ
ς
Τ
τ
Υ
υ
Φ
φ
Χ
χ
Ψ
ψ
ω
А
Б
В
Г
Ґ
Д
Е
Ё
Є
Ж
З
И
І
Ї
Й
К
Л
М
Н
О
П
Р
С
Т
У
Ў
Ф
Х
Ц
Ч
Ш
Щ
Ъ
Ы
Ь
Э
Ю
Я
а
б
в
г
ґ
д
е
ё
є
ж
з
и
і
ї
й
к
л
м
н
о
п
р
с
т
у
ў
ф
х
ц
ч
ш
щ
ъ
ы
ь
э
ю
я
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
;
<
=
>
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
[
\
]
^
_
`
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
{
|
}
~
¡
¢
£
¤
¥
¦
§
¨
©
ª
«
¬
­
®
¯
°
±
²
³
´
µ
·
¸
¹
º
»
¼
½
¾
¿
À
Á
Â
Ã
Ä
Å
Æ
Ç
È
É
Ê
Ë
Ì
Í
Î
Ï
Ð
Ñ
Ò
Ó
Ô
Õ
Ö
×
Ø
Ù
Ú
Û
Ü
Ý
Þ
ß
à
á
â
ã
ä
å
æ
ç
è
é
ê
ë
ì
í
î
ï
ð
ñ
ò
ó
ô
õ
ö
÷
ø
ù
ú
û
ü
ý
þ
ÿ
Ą
ą
Ć
ć
Č
č
Ď
ď
Đ
đ
Ė
ė
Ę
ę
Ě
ě
Ğ
ğ
Į
į
İ
ı
Ĺ
ĺ
Ľ
ľ
Ł
ł
Ń
ń
Ň
ň
ō
Ő
ő
Œ
œ
Ŕ
ŕ
Ř
ř
Ś
ś
Ş
ş
Š
š
Ť
ť
Ū
ū
Ů
ů
Ű
ű
Ų
ų
Ÿ
Ź
ź
Ż
ż
Ž
ž
ƒ
ʒ
Ω
α
β
γ
δ
ε
ζ
η
θ
ι
κ
λ
μ
ν
ξ
ο
π
ρ
ς
σ
τ
υ
φ
χ
ψ
ω
з
𝑢
𝜓
@@ -24,17 +24,17 @@ lang:
rec: en_PP-OCRv4_rec_infer.pth
dict: en_dict.txt
korean:
-    det: Multilingual_PP-OCRv3_det_infer.pth
-    rec: korean_PP-OCRv3_rec_infer.pth
-    dict: korean_dict.txt
+    det: ch_PP-OCRv5_det_infer.pth
+    rec: korean_PP-OCRv5_rec_infer.pth
+    dict: ppocrv5_korean_dict.txt
japan:
det: ch_PP-OCRv5_det_infer.pth
rec: ch_PP-OCRv5_rec_server_infer.pth
-    dict: japan_dict.txt
+    dict: ppocrv5_dict.txt
chinese_cht:
det: ch_PP-OCRv5_det_infer.pth
rec: ch_PP-OCRv5_rec_server_infer.pth
-    dict: chinese_cht_dict.txt
+    dict: ppocrv5_dict.txt
ta:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ta_PP-OCRv3_rec_infer.pth
@@ -48,9 +48,9 @@ lang:
rec: ka_PP-OCRv3_rec_infer.pth
dict: ka_dict.txt
latin:
-    det: en_PP-OCRv3_det_infer.pth
-    rec: latin_PP-OCRv3_rec_infer.pth
-    dict: latin_dict.txt
+    det: ch_PP-OCRv5_det_infer.pth
+    rec: latin_PP-OCRv5_rec_infer.pth
+    dict: ppocrv5_latin_dict.txt
arabic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: arabic_PP-OCRv3_rec_infer.pth
@@ -62,4 +62,8 @@ lang:
devanagari:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: devanagari_PP-OCRv3_rec_infer.pth
-    dict: devanagari_dict.txt
\ No newline at end of file
+    dict: devanagari_dict.txt
+  east_slavic:
+    det: ch_PP-OCRv5_det_infer.pth
+    rec: eslav_PP-OCRv5_rec_infer.pth
+    dict: ppocrv5_eslav_dict.txt
\ No newline at end of file
<html><head>
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
<style>
.link-block {
border: 1px solid transparent;
border-radius: 24px;
background-color: rgba(54, 54, 54, 1);
cursor: pointer !important;
}
.link-block:hover {
background-color: rgba(54, 54, 54, 0.75) !important;
cursor: pointer !important;
}
.external-link {
display: inline-flex;
align-items: center;
height: 36px;
line-height: 36px;
padding: 0 16px;
cursor: pointer !important;
}
.external-link,
.external-link:hover {
cursor: pointer !important;
}
a {
text-decoration: none;
}
</style></head>
<body>
<div style="
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
text-align: center;
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
padding: 24px;
gap: 24px;
border-radius: 8px;
">
<div style="
display: flex;
flex-direction: column;
align-items: center;
gap: 16px;
">
<div style="display: flex; flex-direction: column; gap: 8px">
<h1 style="
font-size: 48px;
color: #fafafa;
margin: 0;
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
">
MinerU 2: PDF Extraction Demo
</h1>
</div>
</div>
<p style="
margin: 0;
line-height: 1.6rem;
font-size: 16px;
color: #fafafa;
opacity: 0.8;
">
A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.<br>
</p>
<style>
.link-block {
display: inline-block;
}
.link-block + .link-block {
margin-left: 20px;
}
</style>
<div class="column has-text-centered">
<div class="publication-links">
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 4px">
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
</span>
<span style="color: white">Code</span>
</a>
</span>
<!-- arXiv Link. -->
<span class="link-block">
<a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 8px">
<i class="fas fa-file" style="color: white"></i>
</span>
<span style="color: white">Paper</span>
</a>
</span>
<!-- Homepage Link. -->
<span class="link-block">
<a href="https://mineru.net/home?source=online" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 8px">
<i class="fas fa-home" style="color: white"></i>
</span>
<span style="color: white">Homepage</span>
</a>
</span>
<!-- Client Link. -->
<span class="link-block">
<a href="https://mineru.net/client?source=online" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 8px">
<i class="fas fa-download" style="color: white"></i>
</span>
<span style="color: white">Download</span>
</a>
</span>
</div>
</div>
<!-- New Demo Links -->
</div>
</body></html>
\ No newline at end of file
@@ -43,7 +43,7 @@ vlm = [
"pydantic",
]
sglang = [
"sglang[all]==0.4.7",
"sglang[all]>=0.4.7,<0.4.9",
]
pipeline = [
"matplotlib>=3.10,<4",
@@ -62,9 +62,20 @@ pipeline = [
"transformers>=4.49.0,!=4.51.0,<5.0.0",
"fast-langdetect>=0.2.3,<0.3.0",
]
+api = [
+    "fastapi",
+    "python-multipart",
+    "uvicorn",
+]
+gradio = [
+    "gradio>=5.34,<6",
+    "gradio-pdf>=0.0.22",
+]
core = [
"mineru[vlm]",
"mineru[pipeline]",
"mineru[api]",
"mineru[gradio]",
]
all = [
"mineru[core]",
......@@ -97,6 +108,8 @@ Repository = "https://github.com/opendatalab/MinerU"
mineru = "mineru.cli:client.main"
mineru-sglang-server = "mineru.cli.vlm_sglang_server:main"
mineru-models-download = "mineru.cli.models_download:download_models"
mineru-api = "mineru.cli.fast_api:main"
mineru-gradio = "mineru.cli.gradio_app:main"
[tool.setuptools.dynamic]
version = {attr = "mineru.version.__version__"}
......
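Both new console scripts ship with the core extra, so after pip install -U "mineru[core]" the services can be started with mineru-api --host 127.0.0.1 --port 8000 (the flags come from the argparse setup in fast_api.py above) or with mineru-gradio.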