Unverified Commit 845a3ff0 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #969 from opendatalab/release-0.9.3

Release 0.9.3
parents d0558abb 6083e109
...@@ -7,7 +7,7 @@ from PIL import Image ...@@ -7,7 +7,7 @@ from PIL import Image
import numpy as np import numpy as np
class ppTableModel(object): class TableMasterPaddleModel(object):
""" """
This class is responsible for converting image of table into HTML format using a pre-trained model. This class is responsible for converting image of table into HTML format using a pre-trained model.
......
...@@ -77,14 +77,12 @@ def __is_list_or_index_block(block): ...@@ -77,14 +77,12 @@ def __is_list_or_index_block(block):
# 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格) # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格)
if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
# block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 and
block['bbox_fs'][2] - last_line['bbox'][2] > line_height block['bbox_fs'][2] - last_line['bbox'][2] > line_height
): ):
multiple_para_flag = True multiple_para_flag = True
for line in block['lines']: for line in block['lines']:
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2 line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2 block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
if ( if (
...@@ -102,13 +100,13 @@ def __is_list_or_index_block(block): ...@@ -102,13 +100,13 @@ def __is_list_or_index_block(block):
if span_type == ContentType.Text: if span_type == ContentType.Text:
line_text += span['content'].strip() line_text += span['content'].strip()
# 添加所有文本,包括空行,保持与block['lines']长度一致
lines_text_list.append(line_text) lines_text_list.append(line_text)
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
left_close_num += 1 left_close_num += 1
elif line['bbox'][0] - block['bbox_fs'][0] > line_height: elif line['bbox'][0] - block['bbox_fs'][0] > line_height:
# logger.info(f"{line_text}, {block['bbox_fs']}, {line['bbox']}")
left_not_close_num += 1 left_not_close_num += 1
# 计算右侧是否顶格 # 计算右侧是否顶格
...@@ -117,7 +115,6 @@ def __is_list_or_index_block(block): ...@@ -117,7 +115,6 @@ def __is_list_or_index_block(block):
else: else:
# 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值
closed_area = 0.26 * block_weight closed_area = 0.26 * block_weight
# closed_area = 5 * line_height
if block['bbox_fs'][2] - line['bbox'][2] > closed_area: if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
right_not_close_num += 1 right_not_close_num += 1
...@@ -128,6 +125,7 @@ def __is_list_or_index_block(block): ...@@ -128,6 +125,7 @@ def __is_list_or_index_block(block):
num_start_count = 0 num_start_count = 0
num_end_count = 0 num_end_count = 0
flag_end_count = 0 flag_end_count = 0
if len(lines_text_list) > 0: if len(lines_text_list) > 0:
for line_text in lines_text_list: for line_text in lines_text_list:
if len(line_text) > 0: if len(line_text) > 0:
...@@ -138,11 +136,10 @@ def __is_list_or_index_block(block): ...@@ -138,11 +136,10 @@ def __is_list_or_index_block(block):
if line_text[-1].isdigit(): if line_text[-1].isdigit():
num_end_count += 1 num_end_count += 1
if flag_end_count / len(lines_text_list) >= 0.8:
line_end_flag = True
if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8: if num_start_count / len(lines_text_list) >= 0.8 or num_end_count / len(lines_text_list) >= 0.8:
line_num_flag = True line_num_flag = True
if flag_end_count / len(lines_text_list) >= 0.8:
line_end_flag = True
# 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index
if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8) if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
...@@ -176,7 +173,7 @@ def __is_list_or_index_block(block): ...@@ -176,7 +173,7 @@ def __is_list_or_index_block(block):
# 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item # 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item
elif line_end_flag: elif line_end_flag:
for i, line in enumerate(block['lines']): for i, line in enumerate(block['lines']):
if lines_text_list[i][-1] in LIST_END_FLAG: if len(lines_text_list[i]) > 0 and lines_text_list[i][-1] in LIST_END_FLAG:
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
if i + 1 < len(block['lines']): if i + 1 < len(block['lines']):
block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
...@@ -187,13 +184,14 @@ def __is_list_or_index_block(block): ...@@ -187,13 +184,14 @@ def __is_list_or_index_block(block):
if line_start_flag: if line_start_flag:
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
line_start_flag = False line_start_flag = False
# elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight: if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
line[ListLineTag.IS_LIST_END_LINE] = True line[ListLineTag.IS_LIST_END_LINE] = True
line_start_flag = True line_start_flag = True
# 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致 # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致
elif num_start_count >= 2 and num_start_count == flag_end_count: # 简单一点先不考虑左侧不贴边的情况 elif num_start_count >= 2 and num_start_count == flag_end_count:
for i, line in enumerate(block['lines']): for i, line in enumerate(block['lines']):
if len(lines_text_list[i]) > 0:
if lines_text_list[i][0].isdigit(): if lines_text_list[i][0].isdigit():
line[ListLineTag.IS_LIST_START_LINE] = True line[ListLineTag.IS_LIST_START_LINE] = True
if lines_text_list[i][-1] in LIST_END_FLAG: if lines_text_list[i][-1] in LIST_END_FLAG:
......
...@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import ( ...@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \ from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
ocr_prepare_bboxes_for_layout_split_v2 ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks, from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
fix_block_spans, fix_discarded_block,
fix_discarded_block, fix_block_spans_v2) fix_block_spans_v2)
from magic_pdf.pre_proc.ocr_span_list_modify import ( from magic_pdf.pre_proc.ocr_span_list_modify import (
get_qa_need_list_v2, remove_overlaps_low_confidence_spans, get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
remove_overlaps_min_spans) remove_overlaps_min_spans)
...@@ -164,7 +164,7 @@ class ModelSingleton: ...@@ -164,7 +164,7 @@ class ModelSingleton:
def do_predict(boxes: List[List[int]], model) -> List[int]: def do_predict(boxes: List[List[int]], model) -> List[int]:
from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits, from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
prepare_inputs) prepare_inputs)
inputs = boxes2inputs(boxes) inputs = boxes2inputs(boxes)
...@@ -174,8 +174,10 @@ def do_predict(boxes: List[List[int]], model) -> List[int]: ...@@ -174,8 +174,10 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
def cal_block_index(fix_blocks, sorted_bboxes): def cal_block_index(fix_blocks, sorted_bboxes):
for block in fix_blocks:
if sorted_bboxes is not None:
# 使用layoutreader排序
for block in fix_blocks:
line_index_list = [] line_index_list = []
if len(block['lines']) == 0: if len(block['lines']) == 0:
block['index'] = sorted_bboxes.index(block['bbox']) block['index'] = sorted_bboxes.index(block['bbox'])
...@@ -191,6 +193,38 @@ def cal_block_index(fix_blocks, sorted_bboxes): ...@@ -191,6 +193,38 @@ def cal_block_index(fix_blocks, sorted_bboxes):
block['virtual_lines'] = copy.deepcopy(block['lines']) block['virtual_lines'] = copy.deepcopy(block['lines'])
block['lines'] = copy.deepcopy(block['real_lines']) block['lines'] = copy.deepcopy(block['real_lines'])
del block['real_lines'] del block['real_lines']
else:
# 使用xycut排序
block_bboxes = []
for block in fix_blocks:
block_bboxes.append(block['bbox'])
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
block['virtual_lines'] = copy.deepcopy(block['lines'])
block['lines'] = copy.deepcopy(block['real_lines'])
del block['real_lines']
import numpy as np
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
random_boxes = np.array(block_bboxes)
np.random.shuffle(random_boxes)
res = []
recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
assert len(res) == len(block_bboxes)
sorted_boxes = random_boxes[np.array(res)].tolist()
for i, block in enumerate(fix_blocks):
block['index'] = sorted_boxes.index(block['bbox'])
# 生成line index
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
line_inedx = 1
for block in sorted_blocks:
for line in block['lines']:
line['index'] = line_inedx
line_inedx += 1
return fix_blocks return fix_blocks
...@@ -264,6 +298,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): ...@@ -264,6 +298,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
block['lines'].append({'bbox': line, 'spans': []}) block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines) page_line_list.extend(lines)
if len(page_line_list) > 200: # layoutreader最高支持512line
return None
# 使用layoutreader排序 # 使用layoutreader排序
x_scale = 1000.0 / page_w x_scale = 1000.0 / page_w
y_scale = 1000.0 / page_h y_scale = 1000.0 / page_h
......
...@@ -5,3 +5,4 @@ weights: ...@@ -5,3 +5,4 @@ weights:
unimernet_small: MFR/unimernet_small unimernet_small: MFR/unimernet_small
struct_eqtable: TabRec/StructEqTable struct_eqtable: TabRec/StructEqTable
tablemaster: TabRec/TableMaster tablemaster: TabRec/TableMaster
rapid_table: TabRec/RapidTable
\ No newline at end of file
...@@ -14,6 +14,9 @@ from magic_pdf.pipe.TXTPipe import TXTPipe ...@@ -14,6 +14,9 @@ from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import fitz
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
def prepare_env(output_dir, pdf_file_name, method): def prepare_env(output_dir, pdf_file_name, method):
...@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method): ...@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
return local_image_dir, local_md_dir return local_image_dir, local_md_dir
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
# # 将字节数据包装在 BytesIO 对象中
# pdf_file = BytesIO(pdf_bytes)
# # 读取 PDF 的字节数据
# reader = PdfReader(pdf_file)
# # 创建一个新的 PDF 写入器
# writer = PdfWriter()
# # 将所有页面添加到新的 PDF 写入器中
# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
# if end_page_id > len(reader.pages) - 1:
# logger.warning("end_page_id is out of range, use pdf_docs length")
# end_page_id = len(reader.pages) - 1
# for i, page in enumerate(reader.pages):
# if start_page_id <= i <= end_page_id:
# writer.add_page(page)
# # 创建一个字节缓冲区来存储输出的 PDF 数据
# output_buffer = BytesIO()
# # 将 PDF 写入字节缓冲区
# writer.write(output_buffer)
# # 获取字节缓冲区的内容
# converted_pdf_bytes = output_buffer.getvalue()
# return converted_pdf_bytes
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
document = fitz.open("pdf", pdf_bytes)
output_document = fitz.open()
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
if end_page_id > len(document) - 1:
logger.warning("end_page_id is out of range, use pdf_docs length")
end_page_id = len(document) - 1
output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
output_bytes = output_document.tobytes()
return output_bytes
def do_parse( def do_parse(
output_dir, output_dir,
pdf_file_name, pdf_file_name,
...@@ -55,6 +94,8 @@ def do_parse( ...@@ -55,6 +94,8 @@ def do_parse(
f_draw_model_bbox = True f_draw_model_bbox = True
f_draw_line_sort_bbox = True f_draw_line_sort_bbox = True
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
orig_model_list = copy.deepcopy(model_list) orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
parse_method) parse_method)
...@@ -66,15 +107,18 @@ def do_parse( ...@@ -66,15 +107,18 @@ def do_parse(
if parse_method == 'auto': if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list} jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True, pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, # start_page_id=start_page_id, end_page_id=end_page_id,
lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
elif parse_method == 'txt': elif parse_method == 'txt':
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True, pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, # start_page_id=start_page_id, end_page_id=end_page_id,
lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
elif parse_method == 'ocr': elif parse_method == 'ocr':
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True, pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
start_page_id=start_page_id, end_page_id=end_page_id, lang=lang, # start_page_id=start_page_id, end_page_id=end_page_id,
lang=lang,
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable) layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
else: else:
logger.error('unknown parse method') logger.error('unknown parse method')
......
This diff is collapsed.
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="224" height="72" viewBox="-29 -3.67 224 72" xml:space="preserve">
<desc>Created with Fabric.js 5.2.4</desc>
<defs>
</defs>
<rect x="0" y="0" width="100%" height="100%" fill="transparent"></rect>
<g transform="matrix(1 0 0 1 112 36)" id="7a867f58-a908-4f30-a839-fb725512b521" >
<rect style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(255,255,255); fill-rule: nonzero; opacity: 1; visibility: hidden;" vector-effect="non-scaling-stroke" x="-112" y="-36" rx="0" ry="0" width="224" height="72" />
</g>
<g transform="matrix(Infinity NaN NaN Infinity 0 0)" id="29611287-bf1c-4faf-8eb1-df32f6424829" >
</g>
<g transform="matrix(0.07 0 0 0.07 382.02 122.8)" id="60cdd44f-027a-437a-92c4-c8d44c60ef9e" >
<path style="stroke: rgb(0,0,0); stroke-width: 0; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(50,50,42); fill-rule: nonzero; opacity: 1;" vector-effect="non-scaling-stroke" transform=" translate(-64, -64)" d="M 57.62 61.68 C 55.919999999999995 61.92 54.75 63.46 55 65.11 C 55.1668510745875 66.32380621250819 56.039448735907676 67.32218371690155 57.22 67.65 C 57.22 67.65 64.69 70.11 77.4 71.16000000000001 C 87.61000000000001 72.01 99.2 70.43 99.2 70.43 C 100.9 70.39 102.23 68.98 102.19 67.28 C 102.17037752125772 66.4652516996782 101.82707564255573 65.69186585376654 101.23597809465886 65.13079230830253 C 100.644880546762 64.56971876283853 99.85466451370849 64.26716220997277 99.03999999999999 64.29 C 98.83999999999999 64.29 98.63999999999999 64.33000000000001 98.42999999999999 64.37 C 98.42999999999999 64.37 87.08999999999999 65.78 77.88 65.02000000000001 C 65.72999999999999 64.05000000000001 59.11 61.83000000000001 59.11 61.83000000000001 C 58.63 61.670000000000016 58.1 61.59000000000001 57.62 61.670000000000016 Z M 57.62 46.46 C 55.919999999999995 46.7 54.75 48.24 55 49.89 C 55.1668510745875 51.10380621250818 56.039448735907676 52.10218371690154 57.22 52.43 C 57.22 52.43 64.69 54.89 77.4 55.94 C 87.61000000000001 56.79 99.2 55.21 99.2 55.21 C 100.9 55.17 102.23 53.76 102.19 52.06 C 102.17037752125772 51.245251699678214 101.82707564255573 50.47186585376654 101.23597809465886 49.91079230830253 C 100.644880546762 49.34971876283853 99.85466451370849 49.047162209972754 99.03999999999999 49.07 C 98.83999999999999 49.07 98.63999999999999 49.11 98.42999999999999 49.15 C 98.42999999999999 49.15 87.08999999999999 50.559999999999995 77.88 49.8 C 65.72999999999999 48.83 59.11 46.61 59.11 46.61 C 58.63 46.45 58.1 46.37 57.62 46.45 Z M 57.62 31.240000000000002 C 55.919999999999995 31.48 54.75 33.02 55 34.67 C 55.1668510745875 35.88380621250818 56.039448735907676 36.882183716901544 57.22 37.21 C 57.22 37.21 64.69 39.67 77.4 40.72 C 87.61000000000001 41.57 99.2 39.99 99.2 39.99 C 100.9 39.95 102.23 38.54 102.19 36.84 C 102.17037752125772 36.025251699678215 101.82707564255573 35.25186585376654 101.23597809465886 34.690792308302534 C 100.644880546762 34.12971876283853 99.85466451370849 33.827162209972755 99.03999999999999 33.85 C 98.83999999999999 33.85 98.63999999999999 33.89 98.42999999999999 33.93 C 98.42999999999999 33.93 87.08999999999999 35.339999999999996 77.88 34.58 C 65.72999999999999 33.61 59.11 31.389999999999997 59.11 31.389999999999997 C 58.63 31.229999999999997 58.1 31.189999999999998 57.62 31.229999999999997 Z M 57.62 16.060000000000002 C 55.919999999999995 16.3 54.75 17.840000000000003 55 19.490000000000002 C 55.1668510745875 20.703806212508187 56.039448735907676 21.702183716901544 57.22 22.03 C 57.22 22.03 64.69 24.490000000000002 77.4 25.54 C 87.61000000000001 26.39 99.2 24.81 99.2 24.81 C 100.9 24.77 102.23 23.36 102.19 21.66 C 102.17037752125772 20.84525169967821 101.82707564255573 20.07186585376654 101.23597809465886 19.510792308302534 C 100.644880546762 18.949718762838526 99.8546645137085 18.64716220997276 99.03999999999999 18.67 C 98.83999999999999 18.67 98.63999999999999 18.71 98.42999999999999 18.75 C 98.42999999999999 18.75 87.08999999999999 20.16 77.88 19.4 C 65.72999999999999 18.43 59.11 16.209999999999997 59.11 16.209999999999997 C 58.637850878541954 16.01924514007714 58.12188500879498 15.963839409097599 57.62 16.049999999999997 Z M 36.31 0 C 20.32 0.12 14.39 5.05 14.39 5.05 L 14.39 124.42 C 14.39 124.42 20.2 119.41 38.93 120.18 C 57.66 120.95000000000002 61.5 127.53 84.50999999999999 127.97000000000001 C 107.52 128.41000000000003 113.28999999999999 124.42000000000002 113.28999999999999 124.42000000000002 L 113.60999999999999 2.750000000000014 C 113.60999999999999 2.750000000000014 103.28 5.7 83.09 5.86 C 62.95 6.01 58.11 0.73 39.62 0.12 C 38.49 0.04 37.4 0 36.31 0 Z M 49.67 7.79 C 49.67 7.79 59.36 10.98 77.24000000000001 11.870000000000001 C 92.38000000000001 12.64 107.52000000000001 10.38 107.52000000000001 10.38 L 107.52000000000001 118.53 C 107.52000000000001 118.53 99.85000000000001 122.57000000000001 80.68 121.19 C 65.82000000000001 120.14 49.480000000000004 114.49 49.480000000000004 114.49 L 49.68000000000001 7.799999999999997 Z M 40.35 10.620000000000001 C 42.050000000000004 10.620000000000001 43.46 11.990000000000002 43.46 13.73 C 43.46 15.469999999999999 42.09 16.84 40.35 16.84 C 40.35 16.84 35.34 16.88 32.28 17.16 C 27.150000000000002 17.68 23.64 19.54 23.64 19.54 C 22.150000000000002 20.349999999999998 20.25 19.74 19.48 18.25 C 18.67 16.76 19.28 14.86 20.77 14.09 C 22.259999999999998 13.32 25.33 11.67 31.67 11.06 C 35.34 10.66 40.35 10.620000000000001 40.35 10.620000000000001 Z M 37.36 25.880000000000003 C 39.06 25.840000000000003 40.35 25.880000000000003 40.35 25.880000000000003 C 42.050000000000004 26.080000000000002 43.260000000000005 27.62 43.050000000000004 29.310000000000002 C 42.88374644848126 30.726609090871516 41.76660909087151 31.843746448481262 40.35 32.010000000000005 C 40.35 32.010000000000005 35.34 32.050000000000004 32.28 32.330000000000005 C 27.150000000000002 32.85000000000001 23.64 34.71000000000001 23.64 34.71000000000001 C 22.150000000000002 35.52000000000001 20.25 34.91000000000001 19.48 33.42000000000001 C 18.67 31.93000000000001 19.28 30.03000000000001 20.77 29.26000000000001 C 20.77 29.26000000000001 25.33 26.84000000000001 31.67 26.230000000000008 C 33.53 25.99000000000001 35.67 25.910000000000007 37.36 25.870000000000008 Z M 40.35 41.06 C 42.050000000000004 41.06 43.46 42.43 43.46 44.17 C 43.46 45.910000000000004 42.09 47.28 40.35 47.28 C 40.35 47.28 35.34 47.24 32.28 47.56 C 27.150000000000002 48.080000000000005 23.64 49.940000000000005 23.64 49.940000000000005 C 22.150000000000002 50.75000000000001 20.25 50.14000000000001 19.48 48.650000000000006 C 18.67 47.160000000000004 19.28 45.260000000000005 20.77 44.49000000000001 C 20.77 44.49000000000001 25.33 42.07000000000001 31.67 41.46000000000001 C 35.34 41.02000000000001 40.35 41.06000000000001 40.35 41.06000000000001 Z" stroke-linecap="round" />
</g>
<g transform="matrix(0.07 0 0 0.07 396.05 123.14)" style="" id="eb0df536-c517-4781-a7c0-3f84cd77c272" >
<text xml:space="preserve" font-family="Lato" font-size="40" font-style="normal" font-weight="400" style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(0,0,0); fill-rule: nonzero; opacity: 1; white-space: pre;" ><tspan x="-130" y="12.57" >Read The Docs</tspan></text>
</g>
<g transform="matrix(0.28 0 0 0.28 27.88 36)" id="7b9eddb9-1652-4040-9437-2ab90652d624" >
<path style="stroke: rgb(0,0,0); stroke-width: 0; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(50,50,42); fill-rule: nonzero; opacity: 1;" vector-effect="non-scaling-stroke" transform=" translate(-64, -64)" d="M 57.62 61.68 C 55.919999999999995 61.92 54.75 63.46 55 65.11 C 55.1668510745875 66.32380621250819 56.039448735907676 67.32218371690155 57.22 67.65 C 57.22 67.65 64.69 70.11 77.4 71.16000000000001 C 87.61000000000001 72.01 99.2 70.43 99.2 70.43 C 100.9 70.39 102.23 68.98 102.19 67.28 C 102.17037752125772 66.4652516996782 101.82707564255573 65.69186585376654 101.23597809465886 65.13079230830253 C 100.644880546762 64.56971876283853 99.85466451370849 64.26716220997277 99.03999999999999 64.29 C 98.83999999999999 64.29 98.63999999999999 64.33000000000001 98.42999999999999 64.37 C 98.42999999999999 64.37 87.08999999999999 65.78 77.88 65.02000000000001 C 65.72999999999999 64.05000000000001 59.11 61.83000000000001 59.11 61.83000000000001 C 58.63 61.670000000000016 58.1 61.59000000000001 57.62 61.670000000000016 Z M 57.62 46.46 C 55.919999999999995 46.7 54.75 48.24 55 49.89 C 55.1668510745875 51.10380621250818 56.039448735907676 52.10218371690154 57.22 52.43 C 57.22 52.43 64.69 54.89 77.4 55.94 C 87.61000000000001 56.79 99.2 55.21 99.2 55.21 C 100.9 55.17 102.23 53.76 102.19 52.06 C 102.17037752125772 51.245251699678214 101.82707564255573 50.47186585376654 101.23597809465886 49.91079230830253 C 100.644880546762 49.34971876283853 99.85466451370849 49.047162209972754 99.03999999999999 49.07 C 98.83999999999999 49.07 98.63999999999999 49.11 98.42999999999999 49.15 C 98.42999999999999 49.15 87.08999999999999 50.559999999999995 77.88 49.8 C 65.72999999999999 48.83 59.11 46.61 59.11 46.61 C 58.63 46.45 58.1 46.37 57.62 46.45 Z M 57.62 31.240000000000002 C 55.919999999999995 31.48 54.75 33.02 55 34.67 C 55.1668510745875 35.88380621250818 56.039448735907676 36.882183716901544 57.22 37.21 C 57.22 37.21 64.69 39.67 77.4 40.72 C 87.61000000000001 41.57 99.2 39.99 99.2 39.99 C 100.9 39.95 102.23 38.54 102.19 36.84 C 102.17037752125772 36.025251699678215 101.82707564255573 35.25186585376654 101.23597809465886 34.690792308302534 C 100.644880546762 34.12971876283853 99.85466451370849 33.827162209972755 99.03999999999999 33.85 C 98.83999999999999 33.85 98.63999999999999 33.89 98.42999999999999 33.93 C 98.42999999999999 33.93 87.08999999999999 35.339999999999996 77.88 34.58 C 65.72999999999999 33.61 59.11 31.389999999999997 59.11 31.389999999999997 C 58.63 31.229999999999997 58.1 31.189999999999998 57.62 31.229999999999997 Z M 57.62 16.060000000000002 C 55.919999999999995 16.3 54.75 17.840000000000003 55 19.490000000000002 C 55.1668510745875 20.703806212508187 56.039448735907676 21.702183716901544 57.22 22.03 C 57.22 22.03 64.69 24.490000000000002 77.4 25.54 C 87.61000000000001 26.39 99.2 24.81 99.2 24.81 C 100.9 24.77 102.23 23.36 102.19 21.66 C 102.17037752125772 20.84525169967821 101.82707564255573 20.07186585376654 101.23597809465886 19.510792308302534 C 100.644880546762 18.949718762838526 99.8546645137085 18.64716220997276 99.03999999999999 18.67 C 98.83999999999999 18.67 98.63999999999999 18.71 98.42999999999999 18.75 C 98.42999999999999 18.75 87.08999999999999 20.16 77.88 19.4 C 65.72999999999999 18.43 59.11 16.209999999999997 59.11 16.209999999999997 C 58.637850878541954 16.01924514007714 58.12188500879498 15.963839409097599 57.62 16.049999999999997 Z M 36.31 0 C 20.32 0.12 14.39 5.05 14.39 5.05 L 14.39 124.42 C 14.39 124.42 20.2 119.41 38.93 120.18 C 57.66 120.95000000000002 61.5 127.53 84.50999999999999 127.97000000000001 C 107.52 128.41000000000003 113.28999999999999 124.42000000000002 113.28999999999999 124.42000000000002 L 113.60999999999999 2.750000000000014 C 113.60999999999999 2.750000000000014 103.28 5.7 83.09 5.86 C 62.95 6.01 58.11 0.73 39.62 0.12 C 38.49 0.04 37.4 0 36.31 0 Z M 49.67 7.79 C 49.67 7.79 59.36 10.98 77.24000000000001 11.870000000000001 C 92.38000000000001 12.64 107.52000000000001 10.38 107.52000000000001 10.38 L 107.52000000000001 118.53 C 107.52000000000001 118.53 99.85000000000001 122.57000000000001 80.68 121.19 C 65.82000000000001 120.14 49.480000000000004 114.49 49.480000000000004 114.49 L 49.68000000000001 7.799999999999997 Z M 40.35 10.620000000000001 C 42.050000000000004 10.620000000000001 43.46 11.990000000000002 43.46 13.73 C 43.46 15.469999999999999 42.09 16.84 40.35 16.84 C 40.35 16.84 35.34 16.88 32.28 17.16 C 27.150000000000002 17.68 23.64 19.54 23.64 19.54 C 22.150000000000002 20.349999999999998 20.25 19.74 19.48 18.25 C 18.67 16.76 19.28 14.86 20.77 14.09 C 22.259999999999998 13.32 25.33 11.67 31.67 11.06 C 35.34 10.66 40.35 10.620000000000001 40.35 10.620000000000001 Z M 37.36 25.880000000000003 C 39.06 25.840000000000003 40.35 25.880000000000003 40.35 25.880000000000003 C 42.050000000000004 26.080000000000002 43.260000000000005 27.62 43.050000000000004 29.310000000000002 C 42.88374644848126 30.726609090871516 41.76660909087151 31.843746448481262 40.35 32.010000000000005 C 40.35 32.010000000000005 35.34 32.050000000000004 32.28 32.330000000000005 C 27.150000000000002 32.85000000000001 23.64 34.71000000000001 23.64 34.71000000000001 C 22.150000000000002 35.52000000000001 20.25 34.91000000000001 19.48 33.42000000000001 C 18.67 31.93000000000001 19.28 30.03000000000001 20.77 29.26000000000001 C 20.77 29.26000000000001 25.33 26.84000000000001 31.67 26.230000000000008 C 33.53 25.99000000000001 35.67 25.910000000000007 37.36 25.870000000000008 Z M 40.35 41.06 C 42.050000000000004 41.06 43.46 42.43 43.46 44.17 C 43.46 45.910000000000004 42.09 47.28 40.35 47.28 C 40.35 47.28 35.34 47.24 32.28 47.56 C 27.150000000000002 48.080000000000005 23.64 49.940000000000005 23.64 49.940000000000005 C 22.150000000000002 50.75000000000001 20.25 50.14000000000001 19.48 48.650000000000006 C 18.67 47.160000000000004 19.28 45.260000000000005 20.77 44.49000000000001 C 20.77 44.49000000000001 25.33 42.07000000000001 31.67 41.46000000000001 C 35.34 41.02000000000001 40.35 41.06000000000001 40.35 41.06000000000001 Z" stroke-linecap="round" />
</g>
<g transform="matrix(0.9 0 0 0.9 94 36)" style="" id="385bde16-f9fa-4222-bfea-1d5d5efcf730" >
<text xml:space="preserve" font-family="Lato" font-size="15" font-style="normal" font-weight="100" style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(0,0,0); fill-rule: nonzero; opacity: 1; white-space: pre;" ><tspan x="-48.68" y="4.71" >Read The Docs</tspan></text>
</g>
</svg>
\ No newline at end of file
Changelog
=========
- 2024/09/27 Version 0.8.1 released, Fixed some bugs, and providing a
`localized deployment version <projects/web_demo/README.md>`__ of the
`online
demo <https://opendatalab.com/OpenSourceTools/Extractor/PDF/>`__ and
the `front-end interface <projects/web/README.md>`__.
- 2024/09/09: Version 0.8.0 released, supporting fast deployment with
Dockerfile, and launching demos on Huggingface and Modelscope.
- 2024/08/30: Version 0.7.1 released, add paddle tablemaster table
recognition option
- 2024/08/09: Version 0.7.0b1 released, simplified installation
process, added table recognition functionality
- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict
issues and installation documentation
- 2024/07/05: Initial open-source release
.. warning::
fix ``localized deployment version`` and ``front-end interface``
...@@ -74,3 +74,15 @@ CUDA version used by Paddle needs to be upgraded. ...@@ -74,3 +74,15 @@ CUDA version used by Paddle needs to be upgraded.
pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
Reference: https://github.com/opendatalab/MinerU/issues/558 Reference: https://github.com/opendatalab/MinerU/issues/558
7. On some Linux servers, the program immediately reports an error ``Illegal instruction (core dumped)``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This might be because the server's CPU does not support the AVX/AVX2
instruction set, or the CPU itself supports it but has been disabled by
the system administrator. You can try contacting the system
administrator to remove the restriction or change to a different server.
References: https://github.com/opendatalab/MinerU/issues/591 ,
https://github.com/opendatalab/MinerU/issues/736
Known Issues Known Issues
============ ============
- Reading order is based on the model’s sorting of text distribution in - Reading order is determined by the model based on the spatial
space, which may become disordered under extremely complex layouts. distribution of readable content, and may be out of order in some
areas under extremely complex layouts.
- Vertical text is not supported. - Vertical text is not supported.
- Tables of contents and lists are recognized through rules; a few - Tables of contents and lists are recognized through rules, and some
uncommon list formats may not be identified. uncommon list formats may not be recognized.
- Only one level of headings is supported; hierarchical heading levels - Only one level of headings is supported; hierarchical headings are
are currently not supported. not currently supported.
- Code blocks are not yet supported in the layout model. - Code blocks are not yet supported in the layout model.
- Comic books, art books, elementary school textbooks, and exercise - Comic books, art albums, primary school textbooks, and exercises
books are not well-parsed yet cannot be parsed well.
- Enabling OCR may produce better results in PDFs with a high density - Table recognition may result in row/column recognition errors in
of formulas complex tables.
- If you are processing PDFs with a large number of formulas, it is - OCR recognition may produce inaccurate characters in PDFs of
strongly recommended to enable the OCR function. When using PyMuPDF lesser-known languages (e.g., diacritical marks in Latin script,
to extract text, overlapping text lines can occur, leading to easily confused characters in Arabic script).
inaccurate formula insertion positions. - Some formulas may not render correctly in Markdown.
\ No newline at end of file
...@@ -7,4 +7,3 @@ ...@@ -7,4 +7,3 @@
api/read_api api/read_api
api/schemas api/schemas
api/io api/io
api/classes
\ No newline at end of file
Class Hierarchy
===============
.. inheritance-diagram:: magic_pdf.data.io.base magic_pdf.data.io.http magic_pdf.data.io.s3
:parts: 2
.. inheritance-diagram:: magic_pdf.data.dataset
:parts: 2
.. inheritance-diagram:: magic_pdf.data.data_reader_writer.base magic_pdf.data.data_reader_writer.filebase magic_pdf.data.data_reader_writer.multi_bucket_s3
:parts: 2
...@@ -95,7 +95,7 @@ language = 'en' ...@@ -95,7 +95,7 @@ language = 'en'
html_theme = 'sphinx_book_theme' html_theme = 'sphinx_book_theme'
html_logo = '_static/image/logo.png' html_logo = '_static/image/logo.png'
html_theme_options = { html_theme_options = {
'path_to_docs': 'docs/en', 'path_to_docs': 'next_docs/en',
'repository_url': 'https://github.com/opendatalab/MinerU', 'repository_url': 'https://github.com/opendatalab/MinerU',
'use_repository_button': True, 'use_repository_button': True,
} }
......
...@@ -46,20 +46,29 @@ the relevant PDF**. ...@@ -46,20 +46,29 @@ the relevant PDF**.
Key Features Key Features
------------ ------------
- Removes elements such as headers, footers, footnotes, and page - Remove headers, footers, footnotes, page numbers, etc., to ensure
numbers while maintaining semantic continuity semantic coherence.
- Outputs text in a human-readable order from multi-column documents - Output text in human-readable order, suitable for single-column,
- Retains the original structure of the document, including titles, multi-column, and complex layouts.
paragraphs, and lists - Preserve the structure of the original document, including headings,
- Extracts images, image captions, tables, and table captions paragraphs, lists, etc.
- Automatically recognizes formulas in the document and converts them - Extract images, image descriptions, tables, table titles, and
to LaTeX footnotes.
- Automatically recognizes tables in the document and converts them to - Automatically recognize and convert formulas in the document to LaTeX
LaTeX format.
- Automatically detects and enables OCR for corrupted PDFs - Automatically recognize and convert tables in the document to LaTeX
- Supports both CPU and GPU environments or HTML format.
- Supports Windows, Linux, and Mac platforms - Automatically detect scanned PDFs and garbled PDFs and enable OCR
functionality.
- OCR supports detection and recognition of 84 languages.
- Supports multiple output formats, such as multimodal and NLP
Markdown, JSON sorted by reading order, and rich intermediate
formats.
- Supports various visualization results, including layout
visualization and span visualization, for efficient confirmation of
output quality.
- Supports both CPU and GPU environments.
- Compatible with Windows, Linux, and Mac platforms.
User Guide User Guide
------------- -------------
...@@ -91,14 +100,6 @@ Additional Notes ...@@ -91,14 +100,6 @@ Additional Notes
additional_notes/known_issues additional_notes/known_issues
additional_notes/faq additional_notes/faq
additional_notes/changelog
additional_notes/glossary additional_notes/glossary
Projects
---------
.. toctree::
:maxdepth: 1
:caption: Projects
projects
\ No newline at end of file
llama_index_rag
===============
gradio_app
============
other projects
===============
\ No newline at end of file
...@@ -87,6 +87,8 @@ Read Examples ...@@ -87,6 +87,8 @@ Read Examples
.. code:: python .. code:: python
from magic_pdf.data.data_reader_writer import *
# file based related # file based related
file_based_reader1 = FileBasedDataReader('') file_based_reader1 = FileBasedDataReader('')
...@@ -142,6 +144,8 @@ Write Examples ...@@ -142,6 +144,8 @@ Write Examples
.. code:: python .. code:: python
from magic_pdf.data.data_reader_writer import *
# file based related # file based related
file_based_writer1 = FileBasedDataWriter('') file_based_writer1 = FileBasedDataWriter('')
...@@ -201,4 +205,4 @@ Write Examples ...@@ -201,4 +205,4 @@ Write Examples
s3_writer1.write('s3://test_bucket/efg', '123'.encode()) s3_writer1.write('s3://test_bucket/efg', '123'.encode())
Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/data_reader_writer` for more details Check :doc:`../../api/data_reader_writer` for more details
...@@ -36,5 +36,5 @@ Extract chars via third-party library, currently we use ``pymupdf``. ...@@ -36,5 +36,5 @@ Extract chars via third-party library, currently we use ``pymupdf``.
Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/dataset` for more details Check :doc:`../../api/dataset` for more details
...@@ -21,5 +21,5 @@ if MinerU have not provide the suitable classes. It is easy to implement new cla ...@@ -21,5 +21,5 @@ if MinerU have not provide the suitable classes. It is easy to implement new cla
def write(self, path: str, data: bytes) -> None: def write(self, path: str, data: bytes) -> None:
pass pass
Check :doc:`../../api/classes` for more intuitions or check :doc:`../../api/io` for more details Check :doc:`../../api/io` for more details
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment