Unverified commit 4bb54393, authored by Xiaomeng Zhao and committed by GitHub

Merge pull request #1427 from opendatalab/release-1.0.0

Release 1.0.0
parents 04f084ac 1c9f9942
# Copyright (c) Opendatalab. All rights reserved.
import json

from loguru import logger
from openai import OpenAI

from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text

# TODO: some formulas end with "\", which escapes the "$" delimiter appended after them; this also needs fixing.
formula_optimize_prompt = """Correct the errors in the LaTeX formula according to the following guidelines, making sure the formula renders and stays faithful to the original content:

1. Fix rendering or compilation errors:
    - Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
    - Errors that prevent compilation or rendering, such as keywords not supported by KaTeX
2. Preserve the original information:
    - Keep all important information from the original formula
    - Do not add any new information that is not in the original formula

IMPORTANT: Return only the corrected formula; do not include any introduction, explanation, or metadata.

LaTeX recognition result:
$FORMULA

Your corrected result:
"""
text_optimize_prompt = f"""请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
1. 修正OCR引起的拼写错误和错误:
- 修正常见的OCR错误(例如,'rn' 被误读为 'm')
- 使用上下文和常识进行修正
- 只修正明显的错误,不要不必要的修改内容
- 不要添加额外的句号或其他不必要的标点符号
2. 保持原始结构:
- 保留所有标题和子标题
3. 保留原始内容:
- 保留原始文本中的所有重要信息
- 不要添加任何原始文本中没有的新信息
- 保留段落之间的换行符
4. 保持连贯性:
- 确保内容与前文顺畅连接
- 适当处理在句子中间开始或结束的文本
5. 修正行内公式:
- 去除行内公式前后多余的空格
- 修正公式中的OCR错误
- 确保公式能够通过KaTeX渲染
6. 修正全角字符
- 修正全角标点符号为半角标点符号
- 修正全角字母为半角字母
- 修正全角数字为半角数字
IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。
Previous context:
Current chunk to process:
Corrected text:
"""
def llm_aided_formula(pdf_info_dict, formula_aided_config):
    pass


def llm_aided_text(pdf_info_dict, text_aided_config):
    pass
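Both functions above ship as stubs in this release. For orientation, here is a hedged sketch of how the formula pass might be wired to the same OpenAI-compatible client that llm_aided_title uses below; the config keys mirror that function, but the "interline_equation" block type and the $FORMULA substitution are assumptions, not the project's implementation:

    # Hypothetical sketch only -- not the shipped implementation.
    def _llm_aided_formula_sketch(pdf_info_dict, formula_aided_config):
        client = OpenAI(
            api_key=formula_aided_config["api_key"],
            base_url=formula_aided_config["base_url"],
        )
        for _, page in pdf_info_dict.items():
            for block in page["para_blocks"]:
                if block["type"] != "interline_equation":  # assumed block type
                    continue
                latex = merge_para_with_text(block)
                prompt = formula_optimize_prompt.replace("$FORMULA", latex)
                completion = client.chat.completions.create(
                    model=formula_aided_config["model"],
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.7,
                )
                corrected = completion.choices[0].message.content
                # How `corrected` is written back into the block depends on
                # the span schema, so it is left out of this sketch.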
def llm_aided_title(pdf_info_dict, title_aided_config):
    client = OpenAI(
        api_key=title_aided_config["api_key"],
        base_url=title_aided_config["base_url"],
    )
    title_dict = {}
    origin_title_list = []
    i = 0
    for page_num, page in pdf_info_dict.items():
        blocks = page["para_blocks"]
        for block in blocks:
            if block["type"] == "title":
                origin_title_list.append(block)
                title_text = merge_para_with_text(block)
                title_dict[f"{i}"] = title_text
                i += 1
    # logger.info(f"Title list: {title_dict}")
    title_optimize_prompt = f"""The input is a dict of all titles in a document. Optimize the title results according to the following guidelines so that they match a normal document hierarchy:

1. Preserve the original content:
    - Every element in the input dict is valid; do not delete any element
    - Make sure the output dict has exactly as many elements as the input
2. Keep the key-value correspondence of the dict unchanged
3. Optimize the hierarchy:
    - Assign an appropriate hierarchy level to each title element
    - Title levels must be contiguous; do not skip a level
    - Use at most 4 levels; do not add unnecessary depth
    - Each optimized title value is an integer representing that title's level

IMPORTANT:
Return the optimized json of title levels directly; the returned json does not need to be pretty-printed.

Input title list:
{title_dict}

Corrected title list:
"""
    completion = client.chat.completions.create(
        model=title_aided_config["model"],
        messages=[{'role': 'user', 'content': title_optimize_prompt}],
        temperature=0.7,
    )
    json_completion = json.loads(completion.choices[0].message.content)
    # logger.info(f"Title completion: {json_completion}")
    # logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
    if len(json_completion) == len(title_dict):
        try:
            for i, origin_title_block in enumerate(origin_title_list):
                origin_title_block["level"] = int(json_completion[str(i)])
        except Exception as e:
            logger.exception(e)
    else:
        logger.error("The number of titles in the optimized result does not equal the number of titles in the input.")
@@ -33,6 +33,14 @@ def remove_overlaps_low_confidence_spans(spans):
     return spans, dropped_spans
 
+def check_chars_is_overlap_in_span(chars):
+    for i in range(len(chars)):
+        for j in range(i + 1, len(chars)):
+            if calculate_iou(chars[i]['bbox'], chars[j]['bbox']) > 0.9:
+                return True
+    return False
+
 
 def remove_overlaps_min_spans(spans):
     dropped_spans = []
     # drop the smaller of the overlapping spans
...
@@ -70,7 +70,7 @@ def _remove_overlap_between_bboxes(arr):
                 res[i] = None
             else:
                 keeps[idx] = False
-                drop_reasons.append(drop_reasons)
+                drop_reasons.append(drop_reason)
         if keeps[idx]:
             res[idx] = v
     return res, drop_reasons
...
from abc import ABC, abstractmethod


class AbsReaderWriter(ABC):
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        raise NotImplementedError
import os

from loguru import logger

from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter


class DiskReaderWriter(AbsReaderWriter):
    def __init__(self, parent_path, encoding="utf-8"):
        self.path = parent_path
        self.encoding = encoding

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        if os.path.isabs(path):
            abspath = path
        else:
            abspath = os.path.join(self.path, path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} does not exist")
            raise Exception(f"file {abspath} does not exist")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        if os.path.isabs(path):
            abspath = path
        else:
            abspath = os.path.join(self.path, path)
        directory_path = os.path.dirname(abspath)
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        abspath = path
        if not os.path.isabs(path):
            abspath = os.path.join(self.path, path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            return f.read(limit)


if __name__ == "__main__":
    if 0:
        file_path = "io/test/example.txt"
        drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")
        # write content to the file
        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
        # read content back from the file
        content = drw.read(path=file_path)
        if content:
            logger.info(f"content read from {file_path}: {content}")
    if 1:
        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        content_bin = drw.read_offset("1.txt")
        assert content_bin == b"ABCD!"
        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
        assert content_bin == b"BC"
import boto3
from botocore.config import Config
from loguru import logger

from magic_pdf.libs.commons import join_path, parse_bucket_key
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter


class S3ReaderWriter(AbsReaderWriter):
    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        s3_client = boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )
        return s3_client

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        if s3_relative_path.startswith("s3://"):
            s3_path = s3_relative_path
        else:
            s3_path = join_path(self.path, s3_relative_path)
        bucket_name, key = parse_bucket_key(s3_path)
        res = self.client.get_object(Bucket=bucket_name, Key=key)
        body = res["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            data = body.decode(encoding)  # decode bytes to text
        elif mode == AbsReaderWriter.MODE_BIN:
            data = body
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        return data

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        if s3_relative_path.startswith("s3://"):
            s3_path = s3_relative_path
        else:
            s3_path = join_path(self.path, s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)  # encode text data as bytes
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"content written to {s3_path}")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        if path.startswith("s3://"):
            s3_path = path
        else:
            s3_path = join_path(self.path, path)
        bucket_name, key = parse_bucket_key(s3_path)
        # HTTP Range is inclusive: offset=1, limit=2 yields "bytes=1-2" (two bytes)
        range_header = (
            f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
        )
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()


if __name__ == "__main__":
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
        )
        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            binary_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
        # Range-read data from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json

        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
 import os
-from pathlib import Path
+import shutil
+import tempfile
 
 import click
+import fitz
 from loguru import logger
+from pathlib import Path
 
 import magic_pdf.model as model_config
 from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.libs.version import __version__
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
+
+pdf_suffixes = ['.pdf']
+ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+image_suffixes = ['.png', '.jpeg', '.jpg']
 
 @click.command()
@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
     'path',
     type=click.Path(exists=True),
     required=True,
-    help='local pdf filepath or directory',
+    help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
 )
 @click.option(
     '-o',
@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     model_config.__use_inside_model__ = True
     model_config.__model_mode__ = 'full'
     os.makedirs(output_dir, exist_ok=True)
+    temp_dir = tempfile.mkdtemp()
 
-    def read_fn(path):
-        disk_rw = FileBasedDataReader(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path))
+    def read_fn(path: Path):
+        if path.suffix in ms_office_suffixes:
+            convert_file_to_pdf(str(path), temp_dir)
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+        elif path.suffix in image_suffixes:
+            with open(str(path), 'rb') as f:
+                bits = f.read()
+            pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
+            fn = os.path.join(temp_dir, f"{path.stem}.pdf")
+            with open(fn, 'wb') as f:
+                f.write(pdf_bytes)
+        elif path.suffix in pdf_suffixes:
+            fn = str(path)
+        else:
+            raise Exception(f"Unknown file suffix: {path.suffix}")
+        disk_rw = FileBasedDataReader(os.path.dirname(fn))
+        return disk_rw.read(os.path.basename(fn))
 
-    def parse_doc(doc_path: str):
+    def parse_doc(doc_path: Path):
         try:
             file_name = str(Path(doc_path).stem)
             pdf_data = read_fn(doc_path)
@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
             logger.exception(e)
 
     if os.path.isdir(path):
-        for doc_path in Path(path).glob('*.pdf'):
-            parse_doc(doc_path)
+        for doc_path in Path(path).glob('*'):
+            if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
+                parse_doc(doc_path)
     else:
-        parse_doc(path)
+        parse_doc(Path(path))
+
+    shutil.rmtree(temp_dir)
 
 if __name__ == '__main__':
...
@@ -9,8 +9,9 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.data.data_reader_writer import FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.libs.draw_bbox import draw_char_bbox
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.model.operators import InferenceResult
+from magic_pdf.operators.models import InferenceResult
 
 # from io import BytesIO
 # from pypdf import PdfReader, PdfWriter
@@ -83,6 +84,7 @@ def do_parse(
     f_make_md_mode=MakeMode.MM_MD,
     f_draw_model_bbox=False,
     f_draw_line_sort_bbox=False,
+    f_draw_char_bbox=False,
     start_page_id=0,
     end_page_id=None,
     lang=None,
@@ -94,9 +96,7 @@ def do_parse(
         logger.warning('debug mode is on')
         f_draw_model_bbox = True
         f_draw_line_sort_bbox = True
+        # f_draw_char_bbox = True
 
-    if lang == '':
-        lang = None
 
     pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
         pdf_bytes, start_page_id, end_page_id
@@ -109,7 +109,7 @@ def do_parse(
     )
     image_dir = str(os.path.basename(local_image_dir))
 
-    ds = PymuDocDataset(pdf_bytes)
+    ds = PymuDocDataset(pdf_bytes, lang=lang)
 
     if len(model_list) == 0:
         if model_config.__use_inside_model__:
@@ -118,50 +118,50 @@ def do_parse(
                 infer_result = ds.apply(
                     doc_analyze,
                     ocr=False,
-                    lang=lang,
+                    lang=ds._lang,
                     layout_model=layout_model,
                     formula_enable=formula_enable,
                     table_enable=table_enable,
                 )
                 pipe_result = infer_result.pipe_txt_mode(
-                    image_writer, debug_mode=True, lang=lang
+                    image_writer, debug_mode=True, lang=ds._lang
                 )
             else:
                 infer_result = ds.apply(
                     doc_analyze,
                     ocr=True,
-                    lang=lang,
+                    lang=ds._lang,
                     layout_model=layout_model,
                     formula_enable=formula_enable,
                     table_enable=table_enable,
                 )
                 pipe_result = infer_result.pipe_ocr_mode(
-                    image_writer, debug_mode=True, lang=lang
+                    image_writer, debug_mode=True, lang=ds._lang
                 )
         elif parse_method == 'txt':
             infer_result = ds.apply(
                 doc_analyze,
                 ocr=False,
-                lang=lang,
+                lang=ds._lang,
                 layout_model=layout_model,
                 formula_enable=formula_enable,
                 table_enable=table_enable,
             )
             pipe_result = infer_result.pipe_txt_mode(
-                image_writer, debug_mode=True, lang=lang
+                image_writer, debug_mode=True, lang=ds._lang
             )
         elif parse_method == 'ocr':
             infer_result = ds.apply(
                 doc_analyze,
                 ocr=True,
-                lang=lang,
+                lang=ds._lang,
                 layout_model=layout_model,
                 formula_enable=formula_enable,
                 table_enable=table_enable,
             )
             pipe_result = infer_result.pipe_ocr_mode(
-                image_writer, debug_mode=True, lang=lang
+                image_writer, debug_mode=True, lang=ds._lang
             )
         else:
            logger.error('unknown parse method')
@@ -170,19 +170,26 @@ def do_parse(
            logger.error('need model list input')
            exit(2)
     else:
         infer_result = InferenceResult(model_list, ds)
         if parse_method == 'ocr':
             pipe_result = infer_result.pipe_ocr_mode(
-                image_writer, debug_mode=True, lang=lang
+                image_writer, debug_mode=True, lang=ds._lang
             )
         elif parse_method == 'txt':
             pipe_result = infer_result.pipe_txt_mode(
-                image_writer, debug_mode=True, lang=lang
+                image_writer, debug_mode=True, lang=ds._lang
             )
         else:
-            pipe_result = infer_result.pipe_auto_mode(
-                image_writer, debug_mode=True, lang=lang
-            )
+            if ds.classify() == SupportedPdfParseMethod.TXT:
+                pipe_result = infer_result.pipe_txt_mode(
+                    image_writer, debug_mode=True, lang=ds._lang
+                )
+            else:
+                pipe_result = infer_result.pipe_ocr_mode(
+                    image_writer, debug_mode=True, lang=ds._lang
+                )
 
     if f_draw_model_bbox:
         infer_result.draw_model(
@@ -201,6 +208,9 @@ def do_parse(
             os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
         )
 
+    if f_draw_char_bbox:
+        draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
+
     if f_dump_md:
         pipe_result.dump_md(
             md_writer,
...
"""用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
def parse_txt_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a text-based pdf."""
    pdf_info_dict = parse_pdf_by_txt(
        dataset,
        model_list,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
        lang=lang,
    )
    pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang
    return pdf_info_dict


def parse_ocr_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse an OCR-based pdf."""
    pdf_info_dict = parse_pdf_by_ocr(
        dataset,
        model_list,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
        lang=lang,
    )
    pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang
    return pdf_info_dict


def parse_union_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a pdf that mixes text and OCR content, extracting everything."""

    def parse_pdf(method):
        try:
            return method(
                dataset,
                model_list,
                imageWriter,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                debug_mode=is_debug,
                lang=lang,
            )
        except Exception as e:
            logger.exception(e)
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
    if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
        logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
        if len(model_list) == 0:
            layout_model = kwargs.get('layout_model', None)
            formula_enable = kwargs.get('formula_enable', None)
            table_enable = kwargs.get('table_enable', None)
            infer_res = doc_analyze(
                dataset,
                ocr=True,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
                layout_model=layout_model,
                formula_enable=formula_enable,
                table_enable=table_enable,
            )
            model_list = infer_res.get_infer_res()
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
        else:
            pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
    else:
        pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT

    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang
    return pdf_info_dict
import os
import subprocess
from pathlib import Path


class ConvertToPdfError(Exception):
    def __init__(self, msg):
        self.msg = msg
        super().__init__(self.msg)


def convert_file_to_pdf(input_path, output_dir):
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")
    os.makedirs(output_dir, exist_ok=True)
    cmd = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path)
    ]
    process = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if process.returncode != 0:
        raise ConvertToPdfError(process.stderr.decode())
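The conversion shells out to LibreOffice, so the soffice binary must be on PATH. A hedged usage sketch (file names here are hypothetical); soffice names the output after the input's stem, which is also what the CLI's read_fn above relies on:

    try:
        convert_file_to_pdf("slides.pptx", "/tmp/pdf_out")
        # on success the result is written to /tmp/pdf_out/slides.pdf
    except ConvertToPdfError as e:
        print(f"conversion failed: {e.msg}")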
This diff is collapsed.
This diff is collapsed.
@@ -4,8 +4,11 @@ Glossary
 ===========
 
 1. jsonl
 
-    TODO: add description
+    Newline-delimited (\n); each line must be a valid, independent JSON object.
+    Currently, all functions shipped with **MinerU** assume that the JSON object
+    contains a field named either **path** or **file_location**.
 
 2. magic-pdf.json
 
-    TODO
+    TODO: add description
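For illustration, a single line in such a jsonl file might look like this (the field values are made up):

    {"path": "s3://some-bucket/papers/demo.pdf", "track_id": "abc-123"}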
@@ -2,7 +2,7 @@
 Model Api
 ==========
 
-.. autoclass:: magic_pdf.model.InferenceResultBase
+.. autoclass:: magic_pdf.operators.InferenceResultBase
    :members:
    :inherited-members:
    :show-inheritance:
@@ -3,7 +3,7 @@
 Pipeline Api
 =============
 
-.. autoclass:: magic_pdf.pipe.operators.PipeResult
+.. autoclass:: magic_pdf.operators.pipes.PipeResult
    :members:
    :inherited-members:
    :show-inheritance:
\ No newline at end of file
@@ -70,6 +70,12 @@ Key Features
 - Supports both CPU and GPU environments.
 - Compatible with Windows, Linux, and Mac platforms.
 
+.. tip::
+
+   Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
+
 User Guide
 -------------
 
 .. toctree::
...