Unverified Commit 41d96cd8 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #2065 from opendatalab/release-1.3.0

Release 1.3.0
parents c3d43e52 dd96663c
import os
import math
from pathlib import Path
import numpy as np
import cv2
import argparse
root_dir = Path(__file__).resolve().parent.parent.parent
DEFAULT_CFG_PATH = root_dir / "pytorchocr" / "utils" / "resources" / "arch_config.yaml"
def init_args():
    """Build the command-line argument parser shared by the OCR pipeline.

    Returns:
        argparse.ArgumentParser: a parser pre-loaded with every detection,
        recognition, classification, e2e, SR and runtime option. Callers
        either call ``parse_args()`` on it directly or override defaults
        via ``parser.set_defaults(...)`` first.
    """
    def str2bool(v):
        # argparse's plain type=bool treats ANY non-empty string as True,
        # so boolean flags go through this helper instead.
        return v.lower() in ("true", "t", "1")

    parser = argparse.ArgumentParser()
    # params for prediction engine
    parser.add_argument("--use_gpu", type=str2bool, default=False)
    parser.add_argument("--det", type=str2bool, default=True)
    parser.add_argument("--rec", type=str2bool, default=True)
    parser.add_argument("--device", type=str, default='cpu')
    # parser.add_argument("--ir_optim", type=str2bool, default=True)
    # parser.add_argument("--use_tensorrt", type=str2bool, default=False)
    # parser.add_argument("--use_fp16", type=str2bool, default=False)
    parser.add_argument("--gpu_mem", type=int, default=500)
    parser.add_argument("--warmup", type=str2bool, default=False)
    # params for text detector
    parser.add_argument("--image_dir", type=str)
    parser.add_argument("--det_algorithm", type=str, default='DB')
    parser.add_argument("--det_model_path", type=str)
    parser.add_argument("--det_limit_side_len", type=float, default=960)
    parser.add_argument("--det_limit_type", type=str, default='max')
    # DB params
    parser.add_argument("--det_db_thresh", type=float, default=0.3)
    parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
    parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
    parser.add_argument("--max_batch_size", type=int, default=10)
    parser.add_argument("--use_dilation", type=str2bool, default=False)
    parser.add_argument("--det_db_score_mode", type=str, default="fast")
    # EAST params
    parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
    parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
    parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
    # SAST params
    parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
    parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
    parser.add_argument("--det_sast_polygon", type=str2bool, default=False)
    # PSE params
    parser.add_argument("--det_pse_thresh", type=float, default=0)
    parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
    parser.add_argument("--det_pse_min_area", type=float, default=16)
    parser.add_argument("--det_pse_box_type", type=str, default='box')
    parser.add_argument("--det_pse_scale", type=int, default=1)
    # FCE params
    # NOTE(review): type=list on the next two args splits a CLI string into
    # individual characters (argparse calls list("8,16,32")); only the
    # defaults are usable. Kept as-is for interface compatibility — confirm
    # whether these are ever set from the command line before changing.
    parser.add_argument("--scales", type=list, default=[8, 16, 32])
    parser.add_argument("--alpha", type=float, default=1.0)
    parser.add_argument("--beta", type=float, default=1.0)
    parser.add_argument("--fourier_degree", type=int, default=5)
    parser.add_argument("--det_fce_box_type", type=str, default='poly')
    # params for text recognizer
    parser.add_argument("--rec_algorithm", type=str, default='CRNN')
    parser.add_argument("--rec_model_path", type=str)
    parser.add_argument("--rec_image_inverse", type=str2bool, default=True)
    parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
    parser.add_argument("--rec_char_type", type=str, default='ch')
    parser.add_argument("--rec_batch_num", type=int, default=6)
    parser.add_argument("--max_text_length", type=int, default=25)
    parser.add_argument("--use_space_char", type=str2bool, default=True)
    parser.add_argument("--drop_score", type=float, default=0.5)
    parser.add_argument("--limited_max_width", type=int, default=1280)
    parser.add_argument("--limited_min_width", type=int, default=16)
    parser.add_argument(
        "--vis_font_path", type=str,
        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf'))
    parser.add_argument(
        "--rec_char_dict_path",
        type=str,
        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
                             'pytorchocr/utils/ppocr_keys_v1.txt'))
    # params for text classifier
    parser.add_argument("--use_angle_cls", type=str2bool, default=False)
    parser.add_argument("--cls_model_path", type=str)
    parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
    parser.add_argument("--label_list", type=list, default=['0', '180'])
    parser.add_argument("--cls_batch_num", type=int, default=6)
    parser.add_argument("--cls_thresh", type=float, default=0.9)
    parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
    parser.add_argument("--use_pdserving", type=str2bool, default=False)
    # params for e2e
    parser.add_argument("--e2e_algorithm", type=str, default='PGNet')
    parser.add_argument("--e2e_model_path", type=str)
    parser.add_argument("--e2e_limit_side_len", type=float, default=768)
    parser.add_argument("--e2e_limit_type", type=str, default='max')
    # PGNet params
    parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
    parser.add_argument(
        "--e2e_char_dict_path", type=str,
        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
                             'pytorchocr/utils/ic15_dict.txt'))
    parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext')
    # Fixed: was type=bool, which made "--e2e_pgnet_polygon false" evaluate
    # to True (any non-empty string is truthy). str2bool matches the other
    # boolean flags such as --det_sast_polygon.
    parser.add_argument("--e2e_pgnet_polygon", type=str2bool, default=True)
    parser.add_argument("--e2e_pgnet_mode", type=str, default='fast')
    # SR params
    parser.add_argument("--sr_model_path", type=str)
    parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128")
    parser.add_argument("--sr_batch_num", type=int, default=1)
    # params .yaml
    parser.add_argument("--det_yaml_path", type=str, default=None)
    parser.add_argument("--rec_yaml_path", type=str, default=None)
    parser.add_argument("--cls_yaml_path", type=str, default=None)
    parser.add_argument("--e2e_yaml_path", type=str, default=None)
    parser.add_argument("--sr_yaml_path", type=str, default=None)
    # multi-process
    parser.add_argument("--use_mp", type=str2bool, default=False)
    parser.add_argument("--total_process_num", type=int, default=1)
    parser.add_argument("--process_id", type=int, default=0)
    parser.add_argument("--benchmark", type=str2bool, default=False)
    parser.add_argument("--save_log_path", type=str, default="./log_output/")
    parser.add_argument("--show_log", type=str2bool, default=True)
    return parser
def parse_args():
    """Parse command-line arguments using the shared parser from init_args()."""
    return init_args().parse_args()
def get_default_config(args):
    """Expose the parsed argparse namespace as a plain dict.

    Note: this is the namespace's own attribute dict (not a copy), so
    mutating the returned mapping mutates ``args`` as well.
    """
    return args.__dict__
def read_network_config_from_yaml(yaml_path, char_num=None):
    """Load the 'Architecture' section of a model yaml config.

    Args:
        yaml_path (str): path to the yaml file describing the network.
        char_num (int | None): size of the recognition character set; when
            given and the head is MultiHead, per-decoder output channel
            counts are derived from it.

    Returns:
        dict: the Architecture sub-config.

    Raises:
        FileNotFoundError: if *yaml_path* does not exist.
        ValueError: if the yaml has no 'Architecture' key.
    """
    if not os.path.exists(yaml_path):
        raise FileNotFoundError('{} is not existed.'.format(yaml_path))
    import yaml
    with open(yaml_path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    arch = cfg.get('Architecture')
    if arch is None:
        raise ValueError('{} has no Architecture'.format(yaml_path))
    if arch['Head']['name'] == 'MultiHead' and char_num is not None:
        # MultiHead decoders each reserve a few extra symbol slots
        # (SAR: +2, NRTR: +3) on top of the raw character count.
        arch['Head']['out_channels_list'] = {
            'CTCLabelDecode': char_num,
            'SARLabelDecode': char_num + 2,
            'NRTRLabelDecode': char_num + 3,
        }
    return arch
def AnalysisConfig(weights_path, yaml_path=None, char_num=None):
    """Resolve the network architecture config for a weights file.

    Args:
        weights_path (str): path to the model weights; must exist.
        yaml_path (str | None): optional yaml describing the architecture.
        char_num (int | None): forwarded to the yaml reader.

    Returns:
        dict | None: the Architecture config when *yaml_path* is given,
        otherwise None (callers are expected to fall back to a built-in
        config in that case).

    Raises:
        FileNotFoundError: if *weights_path* does not exist.
    """
    if not os.path.exists(os.path.abspath(weights_path)):
        raise FileNotFoundError('{} is not found.'.format(weights_path))
    if yaml_path is None:
        return None
    return read_network_config_from_yaml(yaml_path, char_num=char_num)
def resize_img(img, input_size=600):
    """Scale *img* so that its longest side equals *input_size*.

    Args:
        img: image as anything np.array accepts (PIL image or ndarray).
        input_size (int): target length of the longest side in pixels.

    Returns:
        np.ndarray: the uniformly rescaled image.
    """
    img = np.array(img)
    longest_side = max(img.shape[0], img.shape[1])
    scale = float(input_size) / float(longest_side)
    return cv2.resize(img, None, None, fx=scale, fy=scale)
def str_count(s):
    """Return the display width of *s* measured in CJK character units.

    ASCII letters, digits and whitespace each count as half a CJK
    character; every other character (CJK, punctuation, ...) counts as
    one. The half-width total is rounded up, i.e. the result is
    ``len(s) - ceil(half_width_count / 2)``.

    Args:
        s (str): the input string.

    Returns:
        int: the width in CJK character units.
    """
    import string
    # Fixed: the original also maintained count_zh / count_pu counters that
    # were never used in the result — dead computation removed.
    half_width_count = sum(
        1 for c in s
        if c in string.ascii_letters or c.isdigit() or c.isspace()
    )
    return len(s) - math.ceil(half_width_count / 2)
def base64_to_cv2(b64str):
    """Decode a base64-encoded image string into a BGR cv2 image.

    Args:
        b64str (str): base64-encoded image bytes (e.g. PNG/JPEG payload).

    Returns:
        np.ndarray: the decoded image in BGR channel order, or None if
        cv2 cannot decode the payload.
    """
    import base64
    raw = base64.b64decode(b64str.encode('utf8'))
    # Fixed: np.fromstring is deprecated since NumPy 1.14 and removed in
    # NumPy 2.0; np.frombuffer is the supported equivalent for binary data.
    buf = np.frombuffer(raw, np.uint8)
    return cv2.imdecode(buf, cv2.IMREAD_COLOR)
def get_arch_config(model_path):
    """Look up the network architecture config for a model file.

    The architecture is selected by the model file's stem (filename
    without extension) from the bundled arch_config.yaml.

    Args:
        model_path (str | Path): path to the model weights file.

    Returns:
        the architecture sub-config for this model.

    Raises:
        ValueError: if the stem has no entry in arch_config.yaml.
    """
    from omegaconf import OmegaConf
    configs = OmegaConf.load(DEFAULT_CFG_PATH)
    stem = Path(model_path).stem
    if stem not in configs:
        raise ValueError(f"architecture {stem} is not in arch_config.yaml")
    return configs[stem]
\ No newline at end of file
......@@ -9,7 +9,7 @@ from magic_pdf.libs.config_reader import get_device
class RapidTableModel(object):
def __init__(self, ocr_engine, table_sub_model_name):
def __init__(self, ocr_engine, table_sub_model_name='slanet_plus'):
sub_model_list = [model.value for model in ModelType]
if table_sub_model_name is None:
input_args = RapidTableInput()
......@@ -23,25 +23,17 @@ class RapidTableModel(object):
self.table_model = RapidTable(input_args)
# if ocr_engine is None:
# self.ocr_model_name = "RapidOCR"
# if torch.cuda.is_available():
# from rapidocr_paddle import RapidOCR
# self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
# else:
# from rapidocr_onnxruntime import RapidOCR
# self.ocr_engine = RapidOCR()
# self.ocr_model_name = "RapidOCR"
# if torch.cuda.is_available():
# from rapidocr_paddle import RapidOCR
# self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
# else:
# self.ocr_model_name = "PaddleOCR"
# self.ocr_engine = ocr_engine
# from rapidocr_onnxruntime import RapidOCR
# self.ocr_engine = RapidOCR()
self.ocr_model_name = "PaddleOCR"
self.ocr_engine = ocr_engine
self.ocr_model_name = "RapidOCR"
if torch.cuda.is_available():
from rapidocr_paddle import RapidOCR
self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
else:
from rapidocr_onnxruntime import RapidOCR
self.ocr_engine = RapidOCR()
def predict(self, image):
......
import torch
from struct_eqtable import build_model
from magic_pdf.model.sub_modules.table.table_utils import minify_html
class StructTableModel:
    """Table-structure recognition backed by the StructEqTable model.

    Wraps ``struct_eqtable.build_model`` and post-processes HTML output
    with ``minify_html``. CUDA is required.
    """

    def __init__(self, model_path, max_new_tokens=1024, max_time=60):
        """
        Args:
            model_path: checkpoint path handed to build_model.
            max_new_tokens (int): generation length cap.
            max_time (int): per-inference time budget in seconds.
        """
        # StructEqTable only runs on GPU.
        assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
        self.default_format = "html"
        self.model = build_model(
            model_ckpt=model_path,
            max_new_tokens=max_new_tokens,
            max_time=max_time,
            lmdeploy=False,
            flash_attn=False,
            batch_size=1,
        ).cuda()

    def predict(self, images, output_format=None, **kwargs):
        """Run table recognition on *images*.

        Args:
            images: batch of table images accepted by the model.
            output_format (str | None): 'latex', 'markdown' or 'html';
                defaults to 'html' when None.

        Returns:
            list[str]: one result per image; HTML results are minified.

        Raises:
            ValueError: for an unsupported explicit output_format.
        """
        if output_format is None:
            output_format = self.default_format
        elif output_format not in ('latex', 'markdown', 'html'):
            raise ValueError(f"Output format {output_format} is not supported.")
        results = self.model(images, output_format=output_format)
        if output_format == "html":
            results = [minify_html(html) for html in results]
        return results
import os
import cv2
import numpy as np
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403
class TableMasterPaddleModel(object):
    """Convert a table image into an HTML string with a TableMaster model.

    Attributes:
        table_sys: a ppstructure TableSystem built from the parsed args.

    Methods:
        __init__(config): build the TableSystem from a config dict.
        img2html(image): run recognition on a PIL image or ndarray.
        parse_args(**kwargs): translate the config dict into argparse args.
    """

    def __init__(self, config):
        """
        Args:
            config (dict): must contain 'model_dir'; may contain 'device'
                and 'table_max_len'.
        """
        self.table_sys = TableSystem(self.parse_args(**config))

    def img2html(self, image):
        """Recognize the table in *image* and return its HTML markup.

        Args:
            image (PIL.Image.Image | np.ndarray): table crop; PIL input is
                converted from RGB to the BGR layout cv2/Paddle expect.

        Returns:
            str: HTML describing the table structure and cell contents.
        """
        if isinstance(image, Image.Image):
            image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
        pred_res, _ = self.table_sys(image)
        return pred_res['html']

    def parse_args(self, **kwargs):
        """Build the argparse namespace TableSystem expects.

        All model sub-paths are derived from kwargs['model_dir'];
        kwargs['device'] selects GPU when it starts with 'cuda'.
        """
        parser = init_args()
        model_dir = kwargs.get('model_dir')
        device = kwargs.get('device', 'cpu')
        defaults = {
            'use_gpu': device.startswith('cuda'),
            'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN),  # noqa: F405
            'table_algorithm': 'TableMaster',
            'table_model_dir': os.path.join(model_dir, TABLE_MASTER_DIR),  # noqa: F405
            'table_char_dict_path': os.path.join(model_dir, TABLE_MASTER_DICT),  # noqa: F405
            'det_model_dir': os.path.join(model_dir, DETECT_MODEL_DIR),  # noqa: F405
            'rec_model_dir': os.path.join(model_dir, REC_MODEL_DIR),  # noqa: F405
            'rec_char_dict_path': os.path.join(model_dir, REC_CHAR_DICT),  # noqa: F405
        }
        parser.set_defaults(**defaults)
        # Parse an empty argv so only the defaults above take effect.
        return parser.parse_args([])
......@@ -4,6 +4,7 @@ import os
import re
import statistics
import time
import warnings
from typing import List
import cv2
......@@ -11,6 +12,7 @@ import fitz
import torch
import numpy as np
from loguru import logger
from tqdm import tqdm
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.ocr_content_type import BlockType, ContentType
......@@ -21,20 +23,9 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
from concurrent.futures import ThreadPoolExecutor
try:
import torchtext
if torchtext.__version__ >= '0.18.0':
torchtext.disable_torchtext_deprecation_warning()
except ImportError:
pass
from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
from magic_pdf.post_proc.para_split_v3 import para_split
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
......@@ -42,7 +33,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, check_chars_is_overlap_in_span
remove_overlaps_min_spans, remove_x_overlapping_chars
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
......@@ -64,14 +55,6 @@ def __replace_STX_ETX(text_str: str):
return text_str
def __replace_0xfffd(text_str: str):
    """Replace U+FFFD replacement characters with spaces.

    pymupdf emits \ufffd for glyphs it cannot decode, which would surface
    as garbled text downstream; falsy input (None or '') passes through
    unchanged.
    """
    if not text_str:
        return text_str
    return text_str.replace('\ufffd', ' ')
# 连写字符拆分
def __replace_ligatures(text: str):
ligatures = {
......@@ -84,16 +67,17 @@ def chars_to_content(span):
# 检查span中的char是否为空
if len(span['chars']) == 0:
pass
# span['content'] = ''
elif check_chars_is_overlap_in_span(span['chars']):
pass
else:
# 先给chars按char['bbox']的中心点的x坐标排序
span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
# 求char的平均宽度
char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
char_avg_width = char_width_sum / len(span['chars'])
# Calculate the width of each character
char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
# Calculate the median width
median_width = statistics.median(char_widths)
# 通过x轴重叠比率移除一部分char
span = remove_x_overlapping_chars(span, median_width)
content = ''
for char in span['chars']:
......@@ -101,13 +85,12 @@ def chars_to_content(span):
# 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度,则需要在中间插入一个空格
char1 = char
char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
content += f"{char['c']} "
else:
content += char['c']
content = __replace_ligatures(content)
span['content'] = __replace_0xfffd(content)
span['content'] = __replace_ligatures(content)
del span['chars']
......@@ -122,10 +105,6 @@ def fill_char_in_spans(spans, all_chars):
spans = sorted(spans, key=lambda x: x['bbox'][1])
for char in all_chars:
# 跳过非法bbox的char
# x1, y1, x2, y2 = char['bbox']
# if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
# continue
for span in spans:
if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
......@@ -215,7 +194,7 @@ def calculate_contrast(img, img_mode) -> float:
std_dev = np.std(gray_img)
# 对比度定义为标准差除以平均值(加上小常数避免除零错误)
contrast = std_dev / (mean_value + 1e-6)
# logger.info(f"contrast: {contrast}")
# logger.debug(f"contrast: {contrast}")
return round(contrast, 2)
# @measure_time
......@@ -308,41 +287,53 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
if len(need_ocr_spans) > 0:
# 初始化ocr模型
atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3,
lang=lang
)
# atom_model_manager = AtomModelSingleton()
# ocr_model = atom_model_manager.get_atom_model(
# atom_model_name='ocr',
# ocr_show_log=False,
# det_db_box_thresh=0.3,
# lang=lang
# )
for span in need_ocr_spans:
# 对span的bbox截图再ocr
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
# 计算span的对比度,低于0.20的span不进行ocr
if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
spans.remove(span)
continue
# pass
span['content'] = ''
span['score'] = 1
span['np_img'] = span_img
ocr_res = ocr_model.ocr(span_img, det=False)
if ocr_res and len(ocr_res) > 0:
if len(ocr_res[0]) > 0:
ocr_text, ocr_score = ocr_res[0][0]
# logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
if ocr_score > 0.5 and len(ocr_text) > 0:
span['content'] = ocr_text
span['score'] = ocr_score
else:
spans.remove(span)
# ocr_res = ocr_model.ocr(span_img, det=False)
# if ocr_res and len(ocr_res) > 0:
# if len(ocr_res[0]) > 0:
# ocr_text, ocr_score = ocr_res[0][0]
# # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
# if ocr_score > 0.5 and len(ocr_text) > 0:
# span['content'] = ocr_text
# span['score'] = float(round(ocr_score, 2))
# else:
# spans.remove(span)
return spans
def model_init(model_name: str):
from transformers import LayoutLMv3ForTokenClassification
device = torch.device(get_device())
device_name = get_device()
bf_16_support = False
if device_name.startswith("cuda"):
bf_16_support = torch.cuda.is_bf16_supported()
elif device_name.startswith("mps"):
bf_16_support = True
device = torch.device(device_name)
if model_name == 'layoutreader':
# 检测modelscope的缓存目录是否存在
layoutreader_model_dir = get_local_layoutreader_model_dir()
......@@ -357,7 +348,10 @@ def model_init(model_name: str):
model = LayoutLMv3ForTokenClassification.from_pretrained(
'hantian/layoutreader'
)
model.to(device).eval()
if bf_16_support:
model.to(device).eval().bfloat16()
else:
model.to(device).eval()
else:
logger.error('model name not allow')
exit(1)
......@@ -383,9 +377,12 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
boxes2inputs, parse_logits, prepare_inputs)
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
inputs = boxes2inputs(boxes)
inputs = prepare_inputs(inputs, model)
logits = model(**inputs).logits.cpu().squeeze(0)
return parse_logits(logits, len(boxes))
......@@ -463,20 +460,20 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
if (
block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
): # 可能是双列结构,可以切细点
lines = int(block_height / line_height) + 1
lines = int(block_height / line_height)
else:
# 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细)
if block_weight > page_w * 0.4:
lines = 3
line_height = (y1 - y0) / lines
elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点)
lines = int(block_height / line_height) + 1
lines = int(block_height / line_height)
else: # 判断长宽比
if block_height / block_weight > 1.2: # 细长的不分
return [[x0, y0, x1, y1]]
else: # 不细长的还是分成两行
lines = 2
line_height = (y1 - y0) / lines
line_height = (y1 - y0) / lines
# 确定从哪个y位置开始绘制线条
current_y = y0
......@@ -492,7 +489,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
else:
return [[x0, y0, x1, y1]]
# @measure_time
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
......@@ -936,17 +933,18 @@ def pdf_parse_union(
logger.warning('end_page_id is out of range, use pdf_docs length')
end_page_id = len(dataset) - 1
"""初始化启动时间"""
start_time = time.time()
# """初始化启动时间"""
# start_time = time.time()
for page_id, page in enumerate(dataset):
"""debug时输出每页解析的耗时."""
if debug_mode:
time_now = time.time()
logger.info(
f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
)
start_time = time_now
# for page_id, page in enumerate(dataset):
for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"):
# """debug时输出每页解析的耗时."""
# if debug_mode:
# time_now = time.time()
# logger.info(
# f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
# )
# start_time = time_now
"""解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id:
......@@ -962,7 +960,47 @@ def pdf_parse_union(
)
pdf_info_dict[f'page_{page_id}'] = page_info
# PerformanceStats.print_stats()
need_ocr_list = []
img_crop_list = []
text_block_list = []
for pange_id, page_info in pdf_info_dict.items():
for block in page_info['preproc_blocks']:
if block['type'] in ['table', 'image']:
for sub_block in block['blocks']:
if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
text_block_list.append(sub_block)
elif block['type'] in ['text', 'title']:
text_block_list.append(block)
for block in page_info['discarded_blocks']:
text_block_list.append(block)
for block in text_block_list:
for line in block['lines']:
for span in line['spans']:
if 'np_img' in span:
need_ocr_list.append(span)
img_crop_list.append(span['np_img'])
span.pop('np_img')
if len(img_crop_list) > 0:
# Get OCR results for this language's images
atom_model_manager = AtomModelSingleton()
ocr_model = atom_model_manager.get_atom_model(
atom_model_name='ocr',
ocr_show_log=False,
det_db_box_thresh=0.3,
lang=lang
)
# rec_start = time.time()
ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
# Verify we have matching counts
assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
# Process OCR results for this language
for index, span in enumerate(need_ocr_list):
ocr_text, ocr_score = ocr_res_list[index]
span['content'] = ocr_text
span['score'] = float(round(ocr_score, 2))
# rec_time = time.time() - rec_start
# logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')
"""分段"""
para_split(pdf_info_dict)
......
......@@ -62,7 +62,15 @@ def merge_spans_to_line(spans, threshold=0.6):
def span_block_type_compatible(span_type, block_type):
if span_type in [ContentType.Text, ContentType.InlineEquation]:
return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
return block_type in [
BlockType.Text,
BlockType.Title,
BlockType.ImageCaption,
BlockType.ImageFootnote,
BlockType.TableCaption,
BlockType.TableFootnote,
BlockType.Discarded
]
elif span_type == ContentType.InterlineEquation:
return block_type in [BlockType.InterlineEquation, BlockType.Text]
elif span_type == ContentType.Image:
......
......@@ -41,6 +41,57 @@ def check_chars_is_overlap_in_span(chars):
return False
def remove_x_overlapping_chars(span, median_width):
    """Drop duplicate characters that overlap heavily on the x-axis.

    Walks adjacent character pairs (assumed sorted left-to-right) and,
    when two chars overlap by more than 30% of the median char width AND
    are the same character (or one is a space), removes the narrower of
    the two. The index is not advanced after a removal so the freshly
    adjacent pair is re-examined.

    Args:
        span (dict): span holding a 'chars' list; each char has a 'bbox'
            [x0, y0, x1, y1] and a single-character string 'c'.
        median_width (float): median character width used to scale the
            overlap threshold.

    Returns:
        dict: the same span, with overlapping duplicates removed in place.
    """
    if 'chars' not in span or len(span['chars']) < 2:
        return span

    chars = span['chars']
    overlap_threshold = median_width * 0.3
    i = 0
    while i + 1 < len(chars):
        left, right = chars[i], chars[i + 1]
        # Horizontal intersection of the two boxes (negative => disjoint).
        overlap = (min(left['bbox'][2], right['bbox'][2])
                   - max(left['bbox'][0], right['bbox'][0]))
        is_duplicate = (left['c'] == right['c']
                        or left['c'] == ' '
                        or right['c'] == ' ')
        if overlap > 0 and overlap > overlap_threshold and is_duplicate:
            left_width = left['bbox'][2] - left['bbox'][0]
            right_width = right['bbox'][2] - right['bbox'][0]
            # Keep the wider char; on ties the right one is dropped.
            chars.pop(i if left_width < right_width else i + 1)
            # Do not advance: re-check the new pair at position i.
        else:
            i += 1
    return span
def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些
......
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/pytorch_model.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
\ No newline at end of file
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
......@@ -2,7 +2,7 @@ weights:
layoutlmv3: Layout/LayoutLMv3/model_final.pth
doclayout_yolo: Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt
yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
unimernet_small: MFR/unimernet_small_2501
unimernet_small: MFR/unimernet_hf_small_2503
struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster
rapid_table: TabRec/RapidTable
\ No newline at end of file
import os
import shutil
import tempfile
from pathlib import Path
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.batch_build_dataset import batch_build_dataset
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.tools.common import batch_do_parse, do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
pdf_suffixes = ['.pdf']
......@@ -87,37 +90,38 @@ without method specified, auto will be used by default.""",
default=None,
)
def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
fn = os.path.join(temp_dir, f'{path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
raise Exception(f'Unknown file suffix: {path.suffix}')
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def parse_doc(doc_path: Path):
def parse_doc(doc_path: Path, dataset: Dataset | None = None):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
if dataset is None:
pdf_data_or_dataset = read_fn(doc_path)
else:
pdf_data_or_dataset = dataset
do_parse(
output_dir,
file_name,
pdf_data,
pdf_data_or_dataset,
[],
method,
debug_able,
......@@ -130,9 +134,23 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
doc_paths = []
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
if doc_path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(doc_path), temp_dir)
doc_path = Path(os.path.join(temp_dir, f'{doc_path.stem}.pdf'))
elif doc_path.suffix in image_suffixes:
with open(str(doc_path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f'{doc_path.stem}.pdf')
with open(fn, 'wb') as f:
f.write(pdf_bytes)
doc_path = Path(fn)
doc_paths.append(doc_path)
datasets = batch_build_dataset(doc_paths, 4, lang)
batch_do_parse(output_dir, [str(doc_path.stem) for doc_path in doc_paths], datasets, method, debug_able, lang=lang)
else:
parse_doc(Path(path))
......
......@@ -8,10 +8,10 @@ import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import Dataset, PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
from magic_pdf.model.doc_analyze_by_custom_model import (batch_doc_analyze,
doc_analyze)
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
......@@ -67,13 +67,13 @@ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_i
return output_bytes
def do_parse(
def _do_parse(
output_dir,
pdf_file_name,
pdf_bytes,
pdf_bytes_or_dataset,
model_list,
parse_method,
debug_able,
debug_able=False,
f_draw_span_bbox=True,
f_draw_layout_bbox=True,
f_dump_md=True,
......@@ -92,16 +92,21 @@ def do_parse(
formula_enable=None,
table_enable=None,
):
from magic_pdf.operators.models import InferenceResult
if debug_able:
logger.warning('debug mode is on')
f_draw_model_bbox = True
f_draw_line_sort_bbox = True
# f_draw_char_bbox = True
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes, start_page_id, end_page_id
)
if isinstance(pdf_bytes_or_dataset, bytes):
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes_or_dataset, start_page_id, end_page_id
)
ds = PymuDocDataset(pdf_bytes, lang=lang)
else:
ds = pdf_bytes_or_dataset
pdf_bytes = ds._raw_data
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
......@@ -109,8 +114,6 @@ def do_parse(
)
image_dir = str(os.path.basename(local_image_dir))
ds = PymuDocDataset(pdf_bytes, lang=lang)
if len(model_list) == 0:
if model_config.__use_inside_model__:
if parse_method == 'auto':
......@@ -241,5 +244,80 @@ def do_parse(
logger.info(f'local output dir is {local_md_dir}')
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes_or_dataset,
    model_list,
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Parse one document (raw PDF bytes or a pre-built dataset) and dump outputs.

    Entry point kept backward-compatible with the pre-1.3.0 signature.  When the
    environment variable ``MINERU_PARALLEL_INFERENCE_COUNT`` is set to a value
    greater than 1, the document is routed through the batch pipeline
    (``batch_do_parse``); otherwise the serial pipeline (``_do_parse``) is used.

    Args:
        output_dir: Directory where all result files are written.
        pdf_file_name: Base name used for the per-document output folder/files.
        pdf_bytes_or_dataset: Raw PDF bytes, or an already constructed Dataset.
        model_list: Pre-computed model inference results; empty list means
            inference is run internally (serial path only — the batch path
            always re-runs inference and ignores this argument).
        parse_method: One of 'auto', 'ocr', 'txt'.
        debug_able: Enables extra debug drawing when True.
        f_*: Feature flags controlling which artifacts are drawn/dumped.
        start_page_id / end_page_id: Page range, applied only when raw bytes
            are given (a Dataset is assumed to be pre-trimmed).
        lang: OCR language hint.
        layout_model / formula_enable / table_enable: Model configuration
            overrides forwarded to the analysis step.
    """
    # Serial by default; opt into batch mode via the environment variable.
    parallel_count = 1
    env_count = os.environ.get('MINERU_PARALLEL_INFERENCE_COUNT')
    if env_count:
        parallel_count = int(env_count)

    if parallel_count > 1:
        if isinstance(pdf_bytes_or_dataset, bytes):
            # Apply the page range up front: batch_do_parse has no
            # start/end page parameters of its own.
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
                pdf_bytes_or_dataset, start_page_id, end_page_id
            )
            ds = PymuDocDataset(pdf_bytes, lang=lang)
        else:
            ds = pdf_bytes_or_dataset
        # Fix: layout_model / formula_enable / table_enable were previously
        # dropped on this path, so parallel mode silently ignored them.
        batch_do_parse(
            output_dir,
            [pdf_file_name],
            [ds],
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )
    else:
        _do_parse(
            output_dir,
            pdf_file_name,
            pdf_bytes_or_dataset,
            model_list,
            parse_method,
            debug_able,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
            lang=lang,
            layout_model=layout_model,
            formula_enable=formula_enable,
            table_enable=table_enable,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
        )
def batch_do_parse(
    output_dir,
    pdf_file_names: list[str],
    pdf_bytes_or_datasets: list[bytes | Dataset],
    parse_method,
    debug_able=False,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    f_draw_char_bbox=False,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run inference on a batch of documents, then dump results one by one.

    Each entry of ``pdf_bytes_or_datasets`` may be raw PDF bytes (wrapped into
    a ``PymuDocDataset``) or an already constructed ``Dataset``.  Inference is
    performed for the whole batch via ``batch_doc_analyze``; the per-document
    output stage is delegated to ``_do_parse`` with the pre-computed model
    results, so no inference is repeated there.
    """
    # Coerce every batch entry into a Dataset; bytes are wrapped lazily here.
    datasets = [
        PymuDocDataset(item, lang=lang) if isinstance(item, bytes) else item
        for item in pdf_bytes_or_datasets
    ]
    infer_results = batch_doc_analyze(
        datasets,
        parse_method,
        lang=lang,
        layout_model=layout_model,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )
    # Dump each document's artifacts using its matching inference result.
    for file_name, dataset, infer_result in zip(pdf_file_names, datasets, infer_results):
        _do_parse(
            output_dir,
            file_name,
            dataset,
            infer_result.get_infer_res(),
            parse_method,
            debug_able,
            f_draw_span_bbox=f_draw_span_bbox,
            f_draw_layout_bbox=f_draw_layout_bbox,
            f_dump_md=f_dump_md,
            f_dump_middle_json=f_dump_middle_json,
            f_dump_model_json=f_dump_model_json,
            f_dump_orig_pdf=f_dump_orig_pdf,
            f_dump_content_list=f_dump_content_list,
            f_make_md_mode=f_make_md_mode,
            f_draw_model_bbox=f_draw_model_bbox,
            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
            f_draw_char_bbox=f_draw_char_bbox,
            lang=lang,
        )
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
......@@ -28,12 +28,12 @@ NVIDIA drivers are already installed, and you can skip Step 2.
.. note::
``CUDA Version`` should be >= 12.1, If the displayed version number is less than 12.1, please upgrade the driver.
``CUDA Version`` should be >= 12.4. If the displayed version number is less than 12.4, please upgrade the driver.
.. code:: text
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.34 Driver Version: 537.34 CUDA Version: 12.2 |
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
......@@ -52,7 +52,7 @@ If no driver is installed, use the following command:
.. code:: sh
sudo apt-get update
sudo apt-get install nvidia-driver-545
sudo apt-get install nvidia-driver-570-server
Install the proprietary driver and restart your computer after
installation.
......@@ -80,15 +80,15 @@ Specify Python version 3.10.
.. code:: sh
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
5. Install Applications
~~~~~~~~~~~~~~~~~~~~~~~
.. code:: sh
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install -U magic-pdf[full]
.. admonition:: Important
:class: tip
......@@ -99,7 +99,7 @@ Specify Python version 3.10.
magic-pdf --version
If the version number is less than 0.7.0, please report the issue.
If the version number is less than 1.3.0, please report the issue.
6. Download Models
~~~~~~~~~~~~~~~~~~
......@@ -126,7 +126,7 @@ Download a sample file from the repository and test it.
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
9. Test CUDA Acceleration
......@@ -150,23 +150,6 @@ to test CUDA acceleration:
magic-pdf -p small_ocr.pdf -o ./output
10. Enable CUDA Acceleration for OCR
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. Download ``paddlepaddle-gpu``. Installation will automatically enable
OCR acceleration.
.. code:: sh
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
2. Test OCR acceleration with the following command:
.. code:: sh
magic-pdf -p small_ocr.pdf -o ./output
.. _windows_10_or_11_section:
......@@ -176,11 +159,12 @@ Windows 10/11
1. Install CUDA and cuDNN
~~~~~~~~~~~~~~~~~~~~~~~~~
Required versions: CUDA 11.8 + cuDNN 8.7.0
You need to install a CUDA version that is compatible with torch's requirements. Currently, torch supports CUDA 11.8/12.4/12.6.
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 11.8: https://developer.nvidia.com/cuda-11-8-0-download-archive
- cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x:
https://developer.nvidia.com/rdp/cudnn-archive
2. Install Anaconda
~~~~~~~~~~~~~~~~~~~
......@@ -192,19 +176,17 @@ Download link: https://repo.anaconda.com/archive/Anaconda3-2024.06-1-Windows-x86
3. Create an Environment Using Conda
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Python version must be 3.10.
::
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
4. Install Applications
~~~~~~~~~~~~~~~~~~~~~~~
::
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
pip install -U magic-pdf[full]
.. admonition:: Important
:class: tip
......@@ -215,7 +197,7 @@ Python version must be 3.10.
magic-pdf --version
If the version number is less than 0.7.0, please report it in the issues section.
If the version number is less than 1.3.0, please report it in the issues section.
5. Download Models
~~~~~~~~~~~~~~~~~~
......@@ -242,7 +224,7 @@ Download a sample file from the repository and test it.
.. code:: powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
8. Test CUDA Acceleration
......@@ -251,23 +233,12 @@ Download a sample file from the repository and test it.
If your graphics card has at least 8GB of VRAM, follow these steps to
test CUDA-accelerated parsing performance.
1. **Overwrite the installation of torch and torchvision** supporting CUDA.
1. **Overwrite the installation of torch and torchvision** supporting CUDA. (Please select the appropriate index-url based on your CUDA version. For more details, refer to the `PyTorch official website <https://pytorch.org/get-started/locally/>`_.)
.. code:: sh
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
.. admonition:: Important
:class: tip
❗️Ensure the following versions are specified in the command:
.. code:: sh
pip install --force-reinstall torch==2.6.0 torchvision==0.21.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu124
torch==2.3.1 torchvision==0.18.1
These are the highest versions we support. Installing higher versions without specifying them will cause the program to fail.
2. **Modify the value of ``"device-mode"``** in the ``magic-pdf.json``
configuration file located in your user directory.
......@@ -283,19 +254,3 @@ test CUDA-accelerated parsing performance.
::
magic-pdf -p small_ocr.pdf -o ./output
9. Enable CUDA Acceleration for OCR
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1. **Download paddlepaddle-gpu**, which will automatically enable OCR
acceleration upon installation.
::
pip install paddlepaddle-gpu==2.6.1
2. **Run the following command to test OCR acceleration**:
::
magic-pdf -p small_ocr.pdf -o ./output
......@@ -37,53 +37,57 @@ Also you can try `online demo <https://www.modelscope.cn/studios/OpenDataLab/Min
}
</style>
<table>
<tr>
<td colspan="3" rowspan="2">Operating System</td>
</tr>
<tr>
<td>Ubuntu 22.04 LTS</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64(unsupported ARM Linux)</td>
<td>x86_64(unsupported ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">Memory</td>
<td colspan="3">16GB or more, recommended 32GB+</td>
</tr>
<tr>
<td colspan="3">Python Version</td>
<td colspan="3">3.10(Please make sure to create a Python 3.10 virtual environment using conda)</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver Version</td>
<td>latest (Proprietary Driver)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA Environment</td>
<td>Automatic installation [12.1 (pytorch) + 11.8 (paddle)]</td>
<td>11.8 (manual installation) + cuDNN v8.7.0 (manual installation)</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU Hardware Support List</td>
<td colspan="2">Minimum Requirement 8G+ VRAM</td>
<td colspan="2">3060ti/3070/4060<br>
8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
<td rowspan="2">None</td>
</tr>
<tr>
<td colspan="2">Recommended Configuration 10G+ VRAM</td>
<td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
</td>
</tr>
<tr>
<td colspan="3" rowspan="2">Operating System</td>
</tr>
<tr>
<td>Linux after 2019</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64 / arm64</td>
<td>x86_64(unsupported ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">Memory Requirements</td>
<td colspan="3">16GB or more, recommended 32GB+</td>
</tr>
<tr>
<td colspan="3">Storage Requirements</td>
<td colspan="3">20GB or more, with a preference for SSD</td>
</tr>
<tr>
<td colspan="3">Python Version</td>
<td colspan="3">3.10~3.12</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver Version</td>
<td>latest (Proprietary Driver)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA Environment</td>
<td>11.8/12.4/12.6</td>
<td>11.8/12.4/12.6</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CANN Environment(NPU support)</td>
<td>8.0+(Ascend 910b)</td>
<td>None</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU/MPS Hardware Support List</td>
<td colspan="2">GPU VRAM 6GB or more</td>
<td colspan="2">All GPUs with Tensor Cores produced from Volta(2017) onwards.<br>
More than 6GB VRAM </td>
    <td rowspan="2">Apple silicon</td>
</tr>
</table>
......@@ -93,9 +97,9 @@ Create an environment
.. code-block:: shell
conda create -n MinerU python=3.10
conda activate MinerU
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
conda create -n mineru 'python<3.13' -y
conda activate mineru
pip install -U "magic-pdf[full]"
Download model weight files
......
......@@ -10,7 +10,7 @@
.. admonition:: Important
:class: tip
Docker 需要至少 16GB 显存的 GPU,并且所有加速功能默认启用。
Docker 需要至少 6GB 显存的 GPU,并且所有加速功能默认启用。
在运行此 Docker 容器之前,您可以使用以下命令检查您的设备是否支持 Docker 上的 CUDA 加速。
......@@ -20,10 +20,10 @@
.. code:: sh
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
docker build -t mineru:latest .
docker run --rm -it --gpus=all mineru:latest /bin/bash
magic-pdf --help
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/docker/china/Dockerfile -O Dockerfile
docker build -t mineru:latest .
docker run -it --name mineru --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
magic-pdf --help
.. _ubuntu_22_04_lts_section:
......@@ -42,12 +42,12 @@ Ubuntu 22.04 LTS
.. admonition:: Important
:class: tip
``CUDA Version`` 显示的版本号应 >=12.1,如显示的版本号小于12.1,请升级驱动
``CUDA Version`` 显示的版本号应 >= 12.4,如显示的版本号小于12.4,请升级驱动
.. code:: text
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 537.34 Driver Version: 537.34 CUDA Version: 12.2 |
| NVIDIA-SMI 570.133.07 Driver Version: 572.83 CUDA Version: 12.8 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
......@@ -66,7 +66,7 @@ Ubuntu 22.04 LTS
.. code:: bash
sudo apt-get update
sudo apt-get install nvidia-driver-545
sudo apt-get install nvidia-driver-570-server
安装专有驱动,安装完成后,重启电脑
......@@ -89,19 +89,17 @@ Ubuntu 22.04 LTS
4. 使用 conda 创建环境
---------------------
需指定 python 版本为3.10
.. code:: bash
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
5. 安装应用
-----------
.. code:: bash
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
.. admonition:: Important
:class: tip
......@@ -112,7 +110,7 @@ Ubuntu 22.04 LTS
magic-pdf --version
如果版本号小于0.7.0,请到issue中向我们反馈
如果版本号小于1.3.0,请到issue中向我们反馈
6. 下载模型
-----------
......@@ -136,7 +134,7 @@ Ubuntu 22.04 LTS
.. code:: bash
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/small_ocr.pdf
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/demo/pdfs/small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
9. 测试CUDA加速
......@@ -163,27 +161,8 @@ Ubuntu 22.04 LTS
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断,通常情况下, ``layout detection cost`` 和 ``mfr time`` 应提速10倍以上。
10. 为 ocr 开启 cuda 加速
---------------------
**1.下载paddlepaddle-gpu, 安装完成后会自动开启ocr加速**
.. code:: bash
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断,通常情况下,cuda应比cpu更快。
**2.运行以下命令测试ocr加速效果**
.. code:: bash
magic-pdf -p small_ocr.pdf -o ./output
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断,通常情况下, ``ocr cost`` 应提速10倍以上。
.. _windows_10_or_11_section:
......@@ -194,10 +173,12 @@ Windows 10/11
1. 安装 cuda 和 cuDNN
------------------
需要安装的版本 CUDA 11.8 + cuDNN 8.7.0
需要安装符合torch要求的cuda版本,torch目前支持11.8/12.4/12.6
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- CUDA 12.4 https://developer.nvidia.com/cuda-12-4-0-download-archive
- CUDA 12.6 https://developer.nvidia.com/cuda-12-6-0-download-archive
- CUDA 11.8 https://developer.nvidia.com/cuda-11-8-0-download-archive
- cuDNN v8.7.0 (November 28th, 2022), for CUDA 11.x https://developer.nvidia.com/rdp/cudnn-archive
2. 安装 anaconda
---------------
......@@ -209,19 +190,17 @@ Windows 10/11
3. 使用 conda 创建环境
---------------------
需指定python版本为3.10
.. code:: bash
conda create -n MinerU python=3.10
conda activate MinerU
conda create -n mineru 'python<3.13' -y
conda activate mineru
4. 安装应用
-----------
.. code:: bash
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
pip install -U magic-pdf[full] -i https://mirrors.aliyun.com/pypi/simple
.. admonition:: Important
:class: tip
......@@ -232,7 +211,7 @@ Windows 10/11
magic-pdf --version
如果版本号小于0.7.0,请到issue中向我们反馈
如果版本号小于1.3.0,请到issue中向我们反馈
5. 下载模型
-----------
......@@ -256,7 +235,7 @@ Windows 10/11
.. code:: powershell
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
wget https://github.com/opendatalab/MinerU/raw/master/demo/pdfs/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
8. 测试 CUDA 加速
......@@ -264,22 +243,13 @@ Windows 10/11
如果您的显卡显存大于等于 **8GB**,可以进行以下流程,测试 CUDA 解析加速效果
**1.覆盖安装支持cuda的torch和torchvision**
.. code:: bash
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
**1.覆盖安装支持cuda的torch和torchvision**(请根据cuda版本选择合适的index-url,具体可参考torch官网:https://pytorch.org/get-started/locally/)
.. admonition:: Important
:class: tip
务必在命令中指定以下版本
.. code:: bash
.. code:: bash
torch==2.3.1 torchvision==0.18.1
pip install --force-reinstall torch==2.6.0 torchvision==0.21.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu124
这是我们支持的最高版本,如果不指定版本会自动安装更高版本导致程序无法运行
**2.修改【用户目录】中配置文件magic-pdf.json中”device-mode”的值**
......@@ -298,24 +268,5 @@ Windows 10/11
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断,通常情况下, ``layout detection time`` 和 ``mfr time`` 应提速10倍以上。
9. 为 ocr 开启 cuda 加速
--------------------
**1.下载paddlepaddle-gpu, 安装完成后会自动开启ocr加速**
.. code:: bash
pip install paddlepaddle-gpu==2.6.1
**2.运行以下命令测试ocr加速效果**
.. code:: bash
magic-pdf -p small_ocr.pdf -o ./output
.. admonition:: Tip
:class: tip
CUDA 加速是否生效可以根据 log 中输出的各个阶段的耗时来简单判断,通常情况下, cuda会比cpu更快。
CUDA 加速是否生效可以根据 log 中输出的各个阶段 cost 耗时来简单判断,通常情况下, ``ocr time`` 应提速10倍以上。
......@@ -24,53 +24,58 @@
}
</style>
<table>
<tr>
<td colspan="3" rowspan="2">操作系统</td>
</tr>
<tr>
<td>Ubuntu 22.04 LTS</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64(暂不支持ARM Linux)</td>
<td>x86_64(暂不支持ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">内存</td>
<td colspan="3">大于等于16GB,推荐32G以上</td>
</tr>
<tr>
<td colspan="3">python版本</td>
<td colspan="3">3.10 (请务必通过conda创建3.10虚拟环境)</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver 版本</td>
<td>latest(专有驱动)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA环境</td>
<td>自动安装[12.1(pytorch)+11.8(paddle)]</td>
<td>11.8(手动安装)+cuDNN v8.7.0(手动安装)</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU硬件支持列表</td>
<td colspan="2">最低要求 8G+显存</td>
<td colspan="2">3060ti/3070/4060<br>
8G显存可开启layout、公式识别和ocr加速</td>
<td rowspan="2">None</td>
</tr>
<tr>
<td colspan="2">推荐配置 10G+显存</td>
<td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
10G显存及以上可以同时开启layout、公式识别和ocr加速和表格识别加速<br>
</td>
</tr>
<tr>
<td colspan="3" rowspan="2">操作系统</td>
</tr>
<tr>
<td>Linux after 2019</td>
<td>Windows 10 / 11</td>
<td>macOS 11+</td>
</tr>
<tr>
<td colspan="3">CPU</td>
<td>x86_64 / arm64</td>
<td>x86_64(暂不支持ARM Windows)</td>
<td>x86_64 / arm64</td>
</tr>
<tr>
<td colspan="3">内存</td>
<td colspan="3">大于等于16GB,推荐32G以上</td>
</tr>
<tr>
<td colspan="3">存储空间</td>
<td colspan="3">大于等于20GB,推荐使用SSD以获得最佳性能</td>
</tr>
<tr>
<td colspan="3">python版本</td>
<td colspan="3">>=3.9,<=3.12</td>
</tr>
<tr>
<td colspan="3">Nvidia Driver 版本</td>
<td>latest(专有驱动)</td>
<td>latest</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CUDA环境</td>
<td>11.8/12.4/12.6</td>
<td>11.8/12.4/12.6</td>
<td>None</td>
</tr>
<tr>
<td colspan="3">CANN环境(NPU支持)</td>
<td>8.0+(Ascend 910b)</td>
<td>None</td>
<td>None</td>
</tr>
<tr>
<td rowspan="2">GPU/MPS 硬件支持列表</td>
<td colspan="2">显存6G以上</td>
<td colspan="2">
Volta(2017)及之后生产的全部带Tensor Core的GPU <br>
6G显存及以上</td>
    <td rowspan="2">Apple silicon</td>
</tr>
</table>
......@@ -79,9 +84,9 @@
.. code-block:: shell
conda create -n MinerU python=3.10
conda activate MinerU
pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
conda create -n mineru 'python<3.13' -y
conda activate mineru
pip install -U "magic-pdf[full]" -i https://mirrors.aliyun.com/pypi/simple
下载模型权重文件
......
......@@ -23,10 +23,10 @@ RUN apt-get update && \
COPY requirements.txt .
RUN python -m venv /app/venv && \
. /app/venv/bin/activate && \
pip install -r requirements.txt && \
pip uninstall -y paddlepaddle && \
pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
paddlepaddle-gpu==3.0.0rc1
pip install -r requirements.txt
# pip uninstall -y paddlepaddle && \
# pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
# paddlepaddle-gpu==3.0.0rc1
# Download models
COPY download_models.py .
......@@ -51,8 +51,8 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/*
# Create volume for paddleocr models
RUN mkdir -p /root/.paddleocr
VOLUME [ "/root/.paddleocr" ]
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]
# Copy the app and its configuration file
COPY entrypoint.sh /app/entrypoint.sh
......
......@@ -18,11 +18,9 @@ docker build --build-arg http_proxy=http://127.0.0.1:7890 --build-arg https_prox
## 启动命令
```
docker run --rm -it --gpus=all -v ./paddleocr:/root/.paddleocr -p 8000:8000 mineru-api
docker run --rm -it --gpus=all -p 8000:8000 mineru-api
```
初次调用 API 时会自动下载 paddleocr 的模型(约数十 MB),其余模型已包含在镜像中。
## 测试参数
访问地址:
......@@ -30,31 +28,4 @@ docker run --rm -it --gpus=all -v ./paddleocr:/root/.paddleocr -p 8000:8000 mine
```
http://localhost:8000/docs
http://127.0.0.1:8000/docs
```
## 旧版镜像地址
> 阿里云地址:docker pull registry.cn-beijing.aliyuncs.com/quincyqiang/mineru:0.1-models
>
> dockerhub地址:docker pull quincyqiang/mineru:0.1-models
## 旧版截图
### 启动命令
![](https://i-blog.csdnimg.cn/direct/bcff4f524ea5400db14421ba7cec4989.png)
具体截图请见博客:https://blog.csdn.net/yanqianglifei/article/details/141979684
### 启动日志
![](https://i-blog.csdnimg.cn/direct/4eb5657567e4415eba912179dca5c8aa.png)
### 测试参数
![](https://i-blog.csdnimg.cn/direct/8b3a2bc5908042268e8cc69756e331a2.png)
### 解析效果
![](https://i-blog.csdnimg.cn/direct/a54dcae834ae48d498fb595aca4212c3.png)
```
\ No newline at end of file
......@@ -3,6 +3,7 @@ import os
from base64 import b64encode
from glob import glob
from io import StringIO
import tempfile
from typing import Tuple, Union
import uvicorn
......@@ -10,11 +11,12 @@ from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from loguru import logger
from magic_pdf.data.read_api import read_local_images, read_local_office
import magic_pdf.model as model_config
from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
from magic_pdf.data.data_reader_writer.s3 import S3DataReader, S3DataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
from magic_pdf.libs.config_reader import get_bucket_name, get_s3_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.operators.models import InferenceResult
......@@ -24,6 +26,9 @@ model_config.__use_inside_model__ = True
app = FastAPI()
pdf_extensions = [".pdf"]
office_extensions = [".ppt", ".pptx", ".doc", ".docx"]
image_extensions = [".png", ".jpg"]
class MemoryDataWriter(DataWriter):
def __init__(self):
......@@ -46,8 +51,8 @@ class MemoryDataWriter(DataWriter):
def init_writers(
pdf_path: str = None,
pdf_file: UploadFile = None,
file_path: str = None,
file: UploadFile = None,
output_path: str = None,
output_image_path: str = None,
) -> Tuple[
......@@ -59,19 +64,19 @@ def init_writers(
Initialize writers based on path type
Args:
pdf_path: PDF file path (local path or S3 path)
pdf_file: Uploaded PDF file object
file_path: file path (local path or S3 path)
file: Uploaded file object
output_path: Output directory path
output_image_path: Image output directory path
Returns:
Tuple[writer, image_writer, pdf_bytes]: Returns initialized writer tuple and PDF
file content
Tuple[writer, image_writer, file_bytes]: Returns initialized writer tuple and file content
"""
if pdf_path:
is_s3_path = pdf_path.startswith("s3://")
file_extension:str = None
if file_path:
is_s3_path = file_path.startswith("s3://")
if is_s3_path:
bucket = get_bucket_name(pdf_path)
bucket = get_bucket_name(file_path)
ak, sk, endpoint = get_s3_config(bucket)
writer = S3DataWriter(
......@@ -84,25 +89,29 @@ def init_writers(
temp_reader = S3DataReader(
"", bucket=bucket, ak=ak, sk=sk, endpoint_url=endpoint
)
pdf_bytes = temp_reader.read(pdf_path)
file_bytes = temp_reader.read(file_path)
file_extension = os.path.splitext(file_path)[1]
else:
writer = FileBasedDataWriter(output_path)
image_writer = FileBasedDataWriter(output_image_path)
os.makedirs(output_image_path, exist_ok=True)
with open(pdf_path, "rb") as f:
pdf_bytes = f.read()
with open(file_path, "rb") as f:
file_bytes = f.read()
file_extension = os.path.splitext(file_path)[1]
else:
# 处理上传的文件
pdf_bytes = pdf_file.file.read()
file_bytes = file.file.read()
file_extension = os.path.splitext(file.filename)[1]
writer = FileBasedDataWriter(output_path)
image_writer = FileBasedDataWriter(output_image_path)
os.makedirs(output_image_path, exist_ok=True)
return writer, image_writer, pdf_bytes
return writer, image_writer, file_bytes, file_extension
def process_pdf(
pdf_bytes: bytes,
def process_file(
file_bytes: bytes,
file_extension: str,
parse_method: str,
image_writer: Union[S3DataWriter, FileBasedDataWriter],
) -> Tuple[InferenceResult, PipeResult]:
......@@ -110,14 +119,30 @@ def process_pdf(
Process PDF file content
Args:
pdf_bytes: Binary content of PDF file
file_bytes: Binary content of file
file_extension: file extension
parse_method: Parse method ('ocr', 'txt', 'auto')
image_writer: Image writer
Returns:
Tuple[InferenceResult, PipeResult]: Returns inference result and pipeline result
"""
ds = PymuDocDataset(pdf_bytes)
ds = Union[PymuDocDataset, ImageDataset]
if file_extension in pdf_extensions:
ds = PymuDocDataset(file_bytes)
elif file_extension in office_extensions:
# 需要使用office解析
temp_dir = tempfile.mkdtemp()
with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
f.write(file_bytes)
ds = read_local_office(temp_dir)[0]
elif file_extension in image_extensions:
# 需要使用ocr解析
temp_dir = tempfile.mkdtemp()
with open(os.path.join(temp_dir, f"temp_file.{file_extension}"), "wb") as f:
f.write(file_bytes)
ds = read_local_images(temp_dir)[0]
infer_result: InferenceResult = None
pipe_result: PipeResult = None
......@@ -145,13 +170,13 @@ def encode_image(image_path: str) -> str:
@app.post(
"/pdf_parse",
"/file_parse",
tags=["projects"],
summary="Parse PDF files (supports local files and S3)",
summary="Parse files (supports local files and S3)",
)
async def pdf_parse(
pdf_file: UploadFile = None,
pdf_path: str = None,
async def file_parse(
file: UploadFile = None,
file_path: str = None,
parse_method: str = "auto",
is_json_md_dump: bool = False,
output_dir: str = "output",
......@@ -165,10 +190,10 @@ async def pdf_parse(
to the specified directory.
Args:
pdf_file: The PDF file to be parsed. Must not be specified together with
`pdf_path`
pdf_path: The path to the PDF file to be parsed. Must not be specified together
with `pdf_file`
file: The PDF file to be parsed. Must not be specified together with
`file_path`
file_path: The path to the PDF file to be parsed. Must not be specified together
with `file`
parse_method: Parsing method, can be auto, ocr, or txt. Default is auto. If
results are not satisfactory, try ocr
is_json_md_dump: Whether to write parsed data to .json and .md files. Default
......@@ -181,31 +206,31 @@ async def pdf_parse(
return_content_list: Whether to return parsed PDF content list. Default to False
"""
try:
if (pdf_file is None and pdf_path is None) or (
pdf_file is not None and pdf_path is not None
if (file is None and file_path is None) or (
file is not None and file_path is not None
):
return JSONResponse(
content={"error": "Must provide either pdf_file or pdf_path"},
content={"error": "Must provide either file or file_path"},
status_code=400,
)
# Get PDF filename
pdf_name = os.path.basename(pdf_path if pdf_path else pdf_file.filename).split(
file_name = os.path.basename(file_path if file_path else file.filename).split(
"."
)[0]
output_path = f"{output_dir}/{pdf_name}"
output_path = f"{output_dir}/{file_name}"
output_image_path = f"{output_path}/images"
# Initialize readers/writers and get PDF content
writer, image_writer, pdf_bytes = init_writers(
pdf_path=pdf_path,
pdf_file=pdf_file,
writer, image_writer, file_bytes, file_extension = init_writers(
file_path=file_path,
file=file,
output_path=output_path,
output_image_path=output_image_path,
)
# Process PDF
infer_result, pipe_result = process_pdf(pdf_bytes, parse_method, image_writer)
infer_result, pipe_result = process_file(file_bytes, file_extension, parse_method, image_writer)
# Use MemoryDataWriter to get results
content_list_writer = MemoryDataWriter()
......@@ -226,23 +251,23 @@ async def pdf_parse(
# If results need to be saved
if is_json_md_dump:
writer.write_string(
f"{pdf_name}_content_list.json", content_list_writer.get_value()
f"{file_name}_content_list.json", content_list_writer.get_value()
)
writer.write_string(f"{pdf_name}.md", md_content)
writer.write_string(f"{file_name}.md", md_content)
writer.write_string(
f"{pdf_name}_middle.json", middle_json_writer.get_value()
f"{file_name}_middle.json", middle_json_writer.get_value()
)
writer.write_string(
f"{pdf_name}_model.json",
f"{file_name}_model.json",
json.dumps(model_json, indent=4, ensure_ascii=False),
)
# Save visualization results
pipe_result.draw_layout(os.path.join(output_path, f"{pdf_name}_layout.pdf"))
pipe_result.draw_span(os.path.join(output_path, f"{pdf_name}_spans.pdf"))
pipe_result.draw_layout(os.path.join(output_path, f"{file_name}_layout.pdf"))
pipe_result.draw_span(os.path.join(output_path, f"{file_name}_spans.pdf"))
pipe_result.draw_line_sort(
os.path.join(output_path, f"{pdf_name}_line_sort.pdf")
os.path.join(output_path, f"{file_name}_line_sort.pdf")
)
infer_result.draw_model(os.path.join(output_path, f"{pdf_name}_model.pdf"))
infer_result.draw_model(os.path.join(output_path, f"{file_name}_model.pdf"))
# Build return data
data = {}
......
......@@ -4,12 +4,13 @@ from huggingface_hub import snapshot_download
if __name__ == "__main__":
mineru_patterns = [
"models/Layout/LayoutLMv3/*",
# "models/Layout/LayoutLMv3/*",
"models/Layout/YOLO/*",
"models/MFD/YOLO/*",
"models/MFR/unimernet_small_2501/*",
"models/TabRec/TableMaster/*",
"models/TabRec/StructEqTable/*",
"models/MFR/unimernet_hf_small_2503/*",
"models/OCR/paddleocr_torch/*",
# "models/TabRec/TableMaster/*",
# "models/TabRec/StructEqTable/*",
]
model_dir = snapshot_download(
"opendatalab/PDF-Extract-Kit-1.0",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment