Unverified Commit 4bb54393 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1427 from opendatalab/release-1.0.0

Release 1.0.0
parents 04f084ac 1c9f9942
# Copyright (c) Opendatalab. All rights reserved.
import json
from loguru import logger
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
from openai import OpenAI
# TODO: some formulas end with a trailing "\", which escapes the "$" that is
# appended when the formula is wrapped for rendering; that case still needs a fix.
# Prompt for LLM-aided LaTeX formula repair. The literal "$FORMULA" token is
# presumably replaced with the raw recognition result by the (not yet
# implemented) caller — confirm once llm_aided_formula is filled in.
formula_optimize_prompt = """请根据以下指南修正LaTeX公式的错误,确保公式能够渲染且符合原始内容:
1. 修正渲染或编译错误:
- Some syntax errors such as mismatched/missing/extra tokens. Your task is to fix these syntax errors and make sure corrected results conform to latex math syntax principles.
- 包含KaTeX不支持的关键词等原因导致的无法编译或渲染的错误
2. 保留原始信息:
- 保留原始公式中的所有重要信息
- 不要添加任何原始公式中没有的新信息
IMPORTANT:请仅返回修正后的公式,不要包含任何介绍、解释或元数据。
LaTeX recognition result:
$FORMULA
Your corrected result:
"""
# Prompt for LLM-aided OCR text correction. This was previously declared as an
# f-string although it contains no placeholders; the redundant "f" prefix is
# dropped so that any future "{...}" text in the prompt cannot trigger
# accidental interpolation. The context/chunk sections are presumably appended
# by the caller at request time — confirm once llm_aided_text is implemented.
text_optimize_prompt = """请根据以下指南修正OCR引起的错误,确保文本连贯并符合原始内容:
1. 修正OCR引起的拼写错误和错误:
- 修正常见的OCR错误(例如,'rn' 被误读为 'm')
- 使用上下文和常识进行修正
- 只修正明显的错误,不要不必要的修改内容
- 不要添加额外的句号或其他不必要的标点符号
2. 保持原始结构:
- 保留所有标题和子标题
3. 保留原始内容:
- 保留原始文本中的所有重要信息
- 不要添加任何原始文本中没有的新信息
- 保留段落之间的换行符
4. 保持连贯性:
- 确保内容与前文顺畅连接
- 适当处理在句子中间开始或结束的文本
5. 修正行内公式:
- 去除行内公式前后多余的空格
- 修正公式中的OCR错误
- 确保公式能够通过KaTeX渲染
6. 修正全角字符
- 修正全角标点符号为半角标点符号
- 修正全角字母为半角字母
- 修正全角数字为半角数字
IMPORTANT:请仅返回修正后的文本,保留所有原始格式,包括换行符。不要包含任何介绍、解释或元数据。
Previous context:
Current chunk to process:
Corrected text:
"""
def llm_aided_formula(pdf_info_dict, formula_aided_config):
    """Placeholder for LLM-aided formula correction — not implemented yet.

    Expected (by analogy with llm_aided_title — TODO confirm) to walk the
    per-page blocks in *pdf_info_dict* and repair formulas using the model
    configured in *formula_aided_config*.
    """
    pass
def llm_aided_text(pdf_info_dict, text_aided_config):
    """Placeholder for LLM-aided OCR text correction — not implemented yet.

    Expected (by analogy with llm_aided_title — TODO confirm) to correct OCR
    text in *pdf_info_dict* using the model configured in *text_aided_config*.
    """
    pass
def llm_aided_title(pdf_info_dict, title_aided_config):
    """Assign hierarchy levels to title blocks via an OpenAI-compatible LLM.

    Collects every block of type "title" from *pdf_info_dict* (in document
    order), asks the configured model for an integer level per title, and
    writes the answer back into each block as ``block["level"]``. Any failure
    (malformed JSON from the model, bad values, count mismatch) is logged and
    leaves the blocks unchanged instead of crashing the pipeline.

    Args:
        pdf_info_dict: mapping of page id -> page dict containing "para_blocks".
        title_aided_config: dict with "api_key", "base_url" and "model" keys.
    """
    client = OpenAI(
        api_key=title_aided_config["api_key"],
        base_url=title_aided_config["base_url"],
    )
    # Keys are stringified running indices so the model can answer with a flat
    # {"0": level, "1": level, ...} JSON object.
    title_dict = {}
    origin_title_list = []
    i = 0
    for page in pdf_info_dict.values():
        for block in page["para_blocks"]:
            if block["type"] == "title":
                origin_title_list.append(block)
                title_dict[f"{i}"] = merge_para_with_text(block)
                i += 1
    title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:
1. 保留原始内容:
- 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
- 请务必保证输出的字典中元素的数量和输入的数量一致
2. 保持字典内key-value的对应关系不变
3. 优化层次结构:
- 为每个标题元素添加适当的层次结构
- 标题层级应具有连续性,不能跳过某一层级
- 标题层级最多为4级,不要添加过多的层级
- 优化后的标题为一个整数,代表该标题的层级
IMPORTANT:
请直接返回优化过的由标题层级组成的json,返回的json不需要格式化。
Input title list:
{title_dict}
Corrected title list:
"""
    completion = client.chat.completions.create(
        model=title_aided_config["model"],
        messages=[{'role': 'user', 'content': title_optimize_prompt}],
        temperature=0.7,
    )
    # The model may return something that is not valid JSON; previously this
    # raised out of the function. Fail soft: log and keep the original titles.
    try:
        json_completion = json.loads(completion.choices[0].message.content)
    except Exception as e:
        logger.exception(e)
        return
    if len(json_completion) == len(title_dict):
        try:
            for i, origin_title_block in enumerate(origin_title_list):
                origin_title_block["level"] = int(json_completion[str(i)])
        except Exception as e:
            logger.exception(e)
    else:
        logger.error("The number of titles in the optimized result is not equal to the number of titles in the input.")
......@@ -33,6 +33,14 @@ def remove_overlaps_low_confidence_spans(spans):
return spans, dropped_spans
def check_chars_is_overlap_in_span(chars):
    """Return True when any pair of char bboxes in the span overlaps heavily (IoU > 0.9)."""
    count = len(chars)
    return any(
        calculate_iou(chars[a]['bbox'], chars[b]['bbox']) > 0.9
        for a in range(count)
        for b in range(a + 1, count)
    )
def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些
......
......@@ -70,7 +70,7 @@ def _remove_overlap_between_bboxes(arr):
res[i] = None
else:
keeps[idx] = False
drop_reasons.append(drop_reasons)
drop_reasons.append(drop_reason)
if keeps[idx]:
res[idx] = v
return res, drop_reasons
......
from abc import ABC, abstractmethod
class AbsReaderWriter(ABC):
    """Abstract interface for storage reader/writers (e.g. local disk, S3)."""

    # I/O mode selectors shared by all concrete implementations.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Read and return the content stored at *path* (str or bytes per *mode*)."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Write *content* to *path* (text or binary per *mode*)."""
        raise NotImplementedError

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Read up to *limit* bytes of *path* starting at byte *offset*."""
        raise NotImplementedError
import os
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from loguru import logger
class DiskReaderWriter(AbsReaderWriter):
    """Local-filesystem implementation of AbsReaderWriter.

    Relative paths are resolved against *parent_path*; absolute paths are
    used as-is.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        # Base directory used to resolve relative paths.
        self.path = parent_path
        # Text encoding used for MODE_TXT reads and writes.
        self.encoding = encoding

    def _resolve(self, path):
        """Return *path* unchanged if absolute, else joined to the base path."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a file as text (MODE_TXT) or bytes (MODE_BIN).

        Raises:
            Exception: if the file does not exist.
            ValueError: if *mode* is not a supported mode.
        """
        abspath = self._resolve(path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} not exists")
            # Message typo fixed ("no exists" -> "not exists").
            raise Exception(f"file {abspath} not exists")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write *content* to a file, creating parent directories as needed."""
        abspath = self._resolve(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok avoids the race in the old exists()+makedirs() pattern
            # when two writers create the same directory concurrently.
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Read up to *limit* bytes (None = to EOF) starting at byte *offset*."""
        abspath = self._resolve(path)
        with open(abspath, "rb") as f:
            f.seek(offset)
            return f.read(limit)
if __name__ == "__main__":
    # Ad-hoc manual smoke tests; the branches are toggled by editing 0/1.
    if 0:
        file_path = "io/test/example.txt"
        # Raw string: "\p" in the old literal was an invalid escape sequence
        # (SyntaxWarning on modern Python); the path bytes are unchanged.
        drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")
        # Write content to the file.
        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
        # Read the content back from the file.
        content = drw.read(path=file_path)
        if content:
            logger.info(f"从 {file_path} 读取的内容: {content}")
    if 1:
        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        content_bin = drw.read_offset("1.txt")
        assert content_bin == b"ABCD!"
        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
        assert content_bin == b"BC"
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
class S3ReaderWriter(AbsReaderWriter):
    """S3 implementation of AbsReaderWriter backed by a boto3 client.

    Paths starting with "s3://" are treated as absolute; any other path is
    joined to *parent_path*.
    """

    def __init__(
        self,
        ak: str,
        sk: str,
        endpoint_url: str,
        addressing_style: str = "auto",
        parent_path: str = "",
    ):
        # boto3 S3 client configured with the given credentials/endpoint.
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        # Base s3:// prefix used to resolve relative keys.
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard-mode retries (max 5 attempts)."""
        s3_client = boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={"max_attempts": 5, "mode": "standard"},
            ),
        )
        return s3_client

    def read(self, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Fetch an object and return it as text (MODE_TXT) or bytes (MODE_BIN)."""
        if s3_relative_path.startswith("s3://"):
            s3_path = s3_relative_path
        else:
            s3_path = join_path(self.path, s3_relative_path)
        bucket_name, key = parse_bucket_key(s3_path)
        res = self.client.get_object(Bucket=bucket_name, Key=key)
        body = res["Body"].read()
        if mode == AbsReaderWriter.MODE_TXT:
            data = body.decode(encoding)  # Decode bytes to text
        elif mode == AbsReaderWriter.MODE_BIN:
            data = body
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        return data

    def write(self, content, s3_relative_path, mode=AbsReaderWriter.MODE_TXT, encoding="utf-8"):
        """Upload *content* (encoded for MODE_TXT, raw for MODE_BIN) to S3."""
        if s3_relative_path.startswith("s3://"):
            s3_path = s3_relative_path
        else:
            s3_path = join_path(self.path, s3_relative_path)
        if mode == AbsReaderWriter.MODE_TXT:
            body = content.encode(encoding)  # Encode text data as bytes
        elif mode == AbsReaderWriter.MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Ranged GET: read up to *limit* bytes starting at byte *offset*.

        NOTE(review): *limit* is tested for truthiness, so limit=0 falls back
        to an open-ended range ("bytes=offset-") — confirm this is intended.
        """
        if path.startswith("s3://"):
            s3_path = path
        else:
            s3_path = join_path(self.path, path)
        bucket_name, key = parse_bucket_key(s3_path)
        range_header = (
            f"bytes={offset}-{offset+limit-1}" if limit else f"bytes={offset}-"
        )
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        return res["Body"].read()
if __name__ == "__main__":
    # Ad-hoc manual smoke tests; the branches are toggled by editing 0/1.
    if 0:
        # Config the connection info
        ak = ""
        sk = ""
        endpoint_url = ""
        addressing_style = "auto"
        bucket_name = ""
        # Create an S3ReaderWriter object
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, addressing_style, "s3://bucket_name/"
        )
        # Write text data to S3
        text_data = "This is some text data"
        s3_reader_writer.write(
            text_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_TXT,
        )
        # Read text data from S3
        text_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_TXT
        )
        logger.info(f"Read text data from S3: {text_data_read}")
        # Write binary data to S3
        binary_data = b"This is some binary data"
        s3_reader_writer.write(
            # BUG FIX: previously re-sent text_data (a str) under MODE_BIN;
            # the binary payload was never exercised.
            binary_data,
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json",
            mode=AbsReaderWriter.MODE_BIN,
        )
        # Read binary data from S3
        binary_data_read = s3_reader_writer.read(
            s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=AbsReaderWriter.MODE_BIN
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
        # Range Read text data from S3
        binary_data_read = s3_reader_writer.read_offset(
            path=f"s3://{bucket_name}/ebook/test/test.json", offset=0, limit=10
        )
        logger.info(f"Read binary data from S3: {binary_data_read}")
    if 1:
        import os
        import json
        ak = os.getenv("AK", "")
        sk = os.getenv("SK", "")
        endpoint_url = os.getenv("ENDPOINT", "")
        bucket = os.getenv("S3_BUCKET", "")
        prefix = os.getenv("S3_PREFIX", "")
        key_basename = os.getenv("S3_KEY_BASENAME", "")
        s3_reader_writer = S3ReaderWriter(
            ak, sk, endpoint_url, "auto", f"s3://{bucket}/{prefix}"
        )
        content_bin = s3_reader_writer.read_offset(key_basename)
        assert content_bin[:10] == b'{"track_id'
        assert content_bin[-10:] == b'r":null}}\n'
        content_bin = s3_reader_writer.read_offset(key_basename, offset=424, limit=426)
        jso = json.dumps(content_bin.decode("utf-8"))
        print(jso)
import os
from pathlib import Path
import shutil
import tempfile
import click
import fitz
from loguru import logger
from pathlib import Path
import magic_pdf.model as model_config
from magic_pdf.data.data_reader_writer import FileBasedDataReader
from magic_pdf.libs.version import __version__
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from magic_pdf.utils.office_to_pdf import convert_file_to_pdf
# File types accepted by the CLI. Office documents are converted to PDF via
# LibreOffice (convert_file_to_pdf) and images are wrapped into a PDF with
# fitz before parsing; plain PDFs are read as-is.
pdf_suffixes = ['.pdf']
ms_office_suffixes = ['.ppt', '.pptx', '.doc', '.docx']
image_suffixes = ['.png', '.jpeg', '.jpg']
@click.command()
......@@ -21,7 +28,7 @@ from magic_pdf.tools.common import do_parse, parse_pdf_methods
'path',
type=click.Path(exists=True),
required=True,
help='local pdf filepath or directory',
help='local filepath or directory. support PDF, PPT, PPTX, DOC, DOCX, PNG, JPG files',
)
@click.option(
'-o',
......@@ -83,12 +90,27 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
os.makedirs(output_dir, exist_ok=True)
temp_dir = tempfile.mkdtemp()
def read_fn(path: Path):
if path.suffix in ms_office_suffixes:
convert_file_to_pdf(str(path), temp_dir)
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
elif path.suffix in image_suffixes:
with open(str(path), 'rb') as f:
bits = f.read()
pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
fn = os.path.join(temp_dir, f"{path.stem}.pdf")
with open(fn, 'wb') as f:
f.write(pdf_bytes)
elif path.suffix in pdf_suffixes:
fn = str(path)
else:
raise Exception(f"Unknown file suffix: {path.suffix}")
disk_rw = FileBasedDataReader(os.path.dirname(fn))
return disk_rw.read(os.path.basename(fn))
def read_fn(path):
disk_rw = FileBasedDataReader(os.path.dirname(path))
return disk_rw.read(os.path.basename(path))
def parse_doc(doc_path: str):
def parse_doc(doc_path: Path):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
......@@ -108,10 +130,13 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
logger.exception(e)
if os.path.isdir(path):
for doc_path in Path(path).glob('*.pdf'):
parse_doc(doc_path)
for doc_path in Path(path).glob('*'):
if doc_path.suffix in pdf_suffixes + image_suffixes + ms_office_suffixes:
parse_doc(doc_path)
else:
parse_doc(path)
parse_doc(Path(path))
shutil.rmtree(temp_dir)
if __name__ == '__main__':
......
......@@ -9,8 +9,9 @@ from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.config.make_content_config import DropMode, MakeMode
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.draw_bbox import draw_char_bbox
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.model.operators import InferenceResult
from magic_pdf.operators.models import InferenceResult
# from io import BytesIO
# from pypdf import PdfReader, PdfWriter
......@@ -83,6 +84,7 @@ def do_parse(
f_make_md_mode=MakeMode.MM_MD,
f_draw_model_bbox=False,
f_draw_line_sort_bbox=False,
f_draw_char_bbox=False,
start_page_id=0,
end_page_id=None,
lang=None,
......@@ -94,9 +96,7 @@ def do_parse(
logger.warning('debug mode is on')
f_draw_model_bbox = True
f_draw_line_sort_bbox = True
if lang == '':
lang = None
# f_draw_char_bbox = True
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(
pdf_bytes, start_page_id, end_page_id
......@@ -109,7 +109,7 @@ def do_parse(
)
image_dir = str(os.path.basename(local_image_dir))
ds = PymuDocDataset(pdf_bytes)
ds = PymuDocDataset(pdf_bytes, lang=lang)
if len(model_list) == 0:
if model_config.__use_inside_model__:
......@@ -118,50 +118,50 @@ def do_parse(
infer_result = ds.apply(
doc_analyze,
ocr=False,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
else:
infer_result = ds.apply(
doc_analyze,
ocr=True,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
elif parse_method == 'txt':
infer_result = ds.apply(
doc_analyze,
ocr=False,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
elif parse_method == 'ocr':
infer_result = ds.apply(
doc_analyze,
ocr=True,
lang=lang,
lang=ds._lang,
layout_model=layout_model,
formula_enable=formula_enable,
table_enable=table_enable,
)
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
else:
logger.error('unknown parse method')
......@@ -170,19 +170,26 @@ def do_parse(
logger.error('need model list input')
exit(2)
else:
infer_result = InferenceResult(model_list, ds)
if parse_method == 'ocr':
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
elif parse_method == 'txt':
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=lang
image_writer, debug_mode=True, lang=ds._lang
)
else:
pipe_result = infer_result.pipe_auto_mode(
image_writer, debug_mode=True, lang=lang
)
if ds.classify() == SupportedPdfParseMethod.TXT:
pipe_result = infer_result.pipe_txt_mode(
image_writer, debug_mode=True, lang=ds._lang
)
else:
pipe_result = infer_result.pipe_ocr_mode(
image_writer, debug_mode=True, lang=ds._lang
)
if f_draw_model_bbox:
infer_result.draw_model(
......@@ -201,6 +208,9 @@ def do_parse(
os.path.join(local_md_dir, f'{pdf_file_name}_line_sort.pdf')
)
if f_draw_char_bbox:
draw_char_bbox(pdf_bytes, local_md_dir, f'{pdf_file_name}_char_bbox.pdf')
if f_dump_md:
pipe_result.dump_md(
md_writer,
......
"""用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
from loguru import logger
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
from magic_pdf.config.constants import PARSE_TYPE_TXT, PARSE_TYPE_OCR
def parse_txt_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a text-based (digital-born) PDF and tag the result metadata."""
    result = parse_pdf_by_txt(
        dataset,
        model_list,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
        lang=lang,
    )

    # Stamp parse-type/version metadata onto the result dict.
    metadata = {
        '_parse_type': PARSE_TYPE_TXT,
        '_version_name': __version__,
    }
    if lang is not None:
        metadata['_lang'] = lang
    result.update(metadata)
    return result
def parse_ocr_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a scanned (OCR) PDF and tag the result metadata."""
    result = parse_pdf_by_ocr(
        dataset,
        model_list,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
        lang=lang,
    )

    # Stamp parse-type/version metadata onto the result dict.
    metadata = {
        '_parse_type': PARSE_TYPE_OCR,
        '_version_name': __version__,
    }
    if lang is not None:
        metadata['_lang'] = lang
    result.update(metadata)
    return result
def parse_union_pdf(
    dataset: Dataset,
    model_list: list,
    imageWriter: DataWriter,
    is_debug=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    *args,
    **kwargs
):
    """Parse a mixed text/scanned PDF: try text extraction first, then fall
    back to OCR when the text pass errors out or marks itself droppable.

    Raises:
        Exception: when both the text pass and the OCR pass fail.
    """
    def parse_pdf(method):
        # Run one parse backend; returns None on any exception so the caller
        # can fall back instead of aborting.
        try:
            return method(
                dataset,
                model_list,
                imageWriter,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                debug_mode=is_debug,
                lang=lang,
            )
        except Exception as e:
            logger.exception(e)
            return None

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
    if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
        logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
        if len(model_list) == 0:
            # No precomputed model results available: run OCR inference now.
            layout_model = kwargs.get('layout_model', None)
            formula_enable = kwargs.get('formula_enable', None)
            table_enable = kwargs.get('table_enable', None)
            infer_res = doc_analyze(
                dataset,
                ocr=True,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
                layout_model=layout_model,
                formula_enable=formula_enable,
                table_enable=table_enable,
            )
            # Rebinding model_list matters: parse_pdf is a closure and reads
            # model_list at call time, so the OCR pass below sees the freshly
            # inferred results.
            model_list = infer_res.get_infer_res()
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
        else:
            pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
    else:
        pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT

    pdf_info_dict['_version_name'] = __version__
    if lang is not None:
        pdf_info_dict['_lang'] = lang
    return pdf_info_dict
import os
import subprocess
from pathlib import Path
class ConvertToPdfError(Exception):
    """Raised when the external LibreOffice conversion to PDF fails."""

    def __init__(self, msg):
        super().__init__(msg)
        # Keep the message accessible as an attribute for callers.
        self.msg = msg
def convert_file_to_pdf(input_path, output_dir):
    """Convert a document to PDF in *output_dir* via headless LibreOffice.

    Raises:
        FileNotFoundError: if *input_path* does not exist.
        ConvertToPdfError: if the soffice process exits non-zero.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"The input file {input_path} does not exist.")

    os.makedirs(output_dir, exist_ok=True)

    command = [
        'soffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', str(output_dir),
        str(input_path),
    ]
    # capture_output=True is shorthand for piping both stdout and stderr.
    result = subprocess.run(command, capture_output=True)
    if result.returncode != 0:
        raise ConvertToPdfError(result.stderr.decode())
This diff is collapsed.
This diff is collapsed.
......@@ -4,8 +4,11 @@ Glossary
===========
1. jsonl
TODO: add description
Newline-delimited (``\n``): each line must be a valid, independent JSON object.
Currently, all functions shipped with **MinerU** assume that each JSON object contains a field named either **path** or **file_location**.
2. magic-pdf.json
TODO
2. magic-pdf.json
TODO: add description
......@@ -2,7 +2,7 @@
Model Api
==========
.. autoclass:: magic_pdf.model.InferenceResultBase
.. autoclass:: magic_pdf.operators.InferenceResultBase
:members:
:inherited-members:
:show-inheritance:
......@@ -3,7 +3,7 @@
Pipeline Api
=============
.. autoclass:: magic_pdf.pipe.operators.PipeResult
.. autoclass:: magic_pdf.operators.pipes.PipeResult
:members:
:inherited-members:
:show-inheritance:
\ No newline at end of file
:show-inheritance:
......@@ -70,6 +70,12 @@ Key Features
- Supports both CPU and GPU environments.
- Compatible with Windows, Linux, and Mac platforms.
.. tip::
Get started with MinerU by trying the `online demo <https://www.modelscope.cn/studios/OpenDataLab/MinerU>`_ or :doc:`installing it locally <user_guide/install/install>`.
User Guide
-------------
.. toctree::
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment