Commit c9171d1f authored by zhougaofeng

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json,...

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json, requirements.txt, requirements-docker.txt, requirements-qa.txt, update_version.py, setup.py, magic_pdf/__init__.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/tmp.py, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/pdf_client.py, magic_pdf/tools/common.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/cli.py, magic_pdf/tools/pdf_server.py files
parent 748e3b56
from loguru import logger

from magic_pdf.libs.drop_reason import DropReason


def get_data_source(jso: dict):
    data_source = jso.get("data_source")
    if data_source is None:
        data_source = jso.get("file_source")
    return data_source


def get_data_type(jso: dict):
    data_type = jso.get("data_type")
    if data_type is None:
        data_type = jso.get("file_type")
    return data_type


def get_bookid(jso: dict):
    book_id = jso.get("bookid")
    if book_id is None:
        book_id = jso.get("original_file_id")
    return book_id


def exception_handler(jso: dict, e):
    logger.exception(e)
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso


def get_bookname(jso: dict):
    data_source = get_data_source(jso)
    file_id = jso.get("file_id")
    book_name = f"{data_source}/{file_id}"
    return book_name


def spark_json_extractor(jso: dict) -> dict:
    """
    Extract the needed fields from the input JSON and return them as a dict.
    """
    return {
        "_pdf_type": jso["_pdf_type"],
        "model_list": jso["doc_layout_result"],
    }
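

# Illustrative sketch (not part of the commit): the key names below mirror the fields
# read by spark_json_extractor and get_bookname above; the concrete values are hypothetical.
sample_jso = {
    "_pdf_type": "txt",
    "doc_layout_result": [],       # one model entry per page
    "data_source": "some-bucket",  # get_data_source falls back to "file_source" if absent
    "file_id": "demo-0001",
}
extracted = spark_json_extractor(sample_jso)  # {"_pdf_type": "txt", "model_list": []}
book_name = get_bookname(sample_jso)          # "some-bucket/demo-0001"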
import os
from pathlib import Path

import click
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods


@click.command()
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperforms ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    default='auto',
)
@click.option(
    '-d',
    '--debug',
    'debug_able',
    type=bool,
    help='Enables detailed debugging information during the execution of the CLI commands.',
    default=False,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            # Find all PDF files under the directory
            for file in files:
                if file.endswith('.pdf'):
                    # Build the full path of the PDF file
                    doc_path = os.path.join(root, file)
                    logger.info(f'Parsing: {doc_path}')
                    parse_doc(doc_path)
    else:
        # logger.info(f'Parsing: {path}')
        parse_doc(path)


if __name__ == '__main__':
    cli()
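

# Illustrative sketch (not part of the commit): invoking the click command above
# programmatically. The sample paths are hypothetical.
# from click.testing import CliRunner
# runner = CliRunner()
# result = runner.invoke(cli, ['-p', 'demo.pdf', '-o', './output', '-m', 'auto'])
# print(result.exit_code)
# Equivalent shell usage via the console script declared in setup.py:
#   magic-pdf -p demo.pdf -o ./output -m auto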
import json as json_parse
import os
from pathlib import Path

import click

import magic_pdf.model as model_config
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                       remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods


def read_s3_path(s3path):
    bucket, key = parse_s3path(s3path)

    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
                           remove_non_official_s3_args(s3path))
    may_range_params = parse_s3_range_params(s3path)
    if may_range_params is None or 2 != len(may_range_params):
        byte_start, byte_end = 0, None
    else:
        byte_start, byte_end = int(may_range_params[0]), int(
            may_range_params[1])
    return s3_rw.read_offset(
        remove_non_official_s3_args(s3path),
        byte_start,
        byte_end,
    )


@click.group()
@click.version_option(__version__, '--version', '-v', help='display the version and exit')
def cli():
    pass


@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='path to the input jsonl file, either local or on s3',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='parsing method. txt: for text-based pdfs, ocr: parse the pdf with OCR, auto: let the program pick the best method',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='local output directory',
)
def jsonl(jsonl, method, output_dir):
    model_config.__use_inside_model__ = False
    if jsonl.startswith('s3://'):
        jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
    else:
        with open(jsonl) as f:
            jso = json_parse.loads(f.readline())
    os.makedirs(output_dir, exist_ok=True)
    s3_file_path = jso.get('file_location')
    if s3_file_path is None:
        s3_file_path = jso.get('path')
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='local PDF file',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='json data produced by local model inference',
)
@click.option('-o',
              '--output-dir',
              'output_dir',
              type=click.Path(),
              required=True,
              help='local output directory')
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='parsing method. txt: for text-based pdfs, ocr: parse the pdf with OCR, auto: let the program pick the best method',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    model_config.__use_inside_model__ = False
    full_pdf_path = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def read_fn(path):
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))

    file_name = str(Path(full_pdf_path).stem)
    pdf_data = read_fn(full_pdf_path)
    do_parse(
        output_dir,
        file_name,
        pdf_data,
        model_json_list,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )


if __name__ == '__main__':
    cli()
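

# Illustrative sketch (not part of the commit): one line of the jsonl consumed by the
# `jsonl` sub-command above. Key names follow the code; the values are hypothetical.
sample_record = {
    "file_location": "s3://some-bucket/papers/demo.pdf",  # the code falls back to "path"
    "doc_layout_result": [],  # per-page model output, passed through to do_parse
}
# Shell usage via the dev console script declared in setup.py:
#   magic-pdf-dev jsonl -j input.jsonl -o ./output -m auto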
import copy
import json as json_parse
import os

import click
from loguru import logger

import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
                                      drow_model_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter


def prepare_env(output_dir, pdf_file_name, method):
    local_parent_dir = os.path.join(output_dir, pdf_file_name, method)

    local_image_dir = os.path.join(str(local_parent_dir), 'images')
    local_md_dir = local_parent_dir
    logger.info(f'local_image_dir:{local_image_dir}')
    logger.info(f'local_md_dir:{local_md_dir}')
    os.makedirs(local_image_dir, exist_ok=True)
    os.makedirs(local_md_dir, exist_ok=True)
    return local_image_dir, local_md_dir


def remove_empty_lines_from_file(file_path):
    # Read the file content
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Drop empty lines
    non_empty_lines = [line for line in lines if line.strip()]

    # Write the non-empty lines back to the original file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(non_empty_lines)


def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    debug_able,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=False,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    start_page_id=0,
    end_page_id=None,
):
    if debug_able:
        logger.warning('debug mode is on')
        f_dump_content_list = True
        f_draw_model_bbox = True

    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
                                                parse_method)

    image_writer, md_writer = DiskReaderWriter(
        local_image_dir), DiskReaderWriter(local_md_dir)
    image_dir = str(os.path.basename(local_image_dir))
    # logger.info(f'model_list:{model_list}')
    # logger.info(f'local_image_dir:::{local_image_dir}')
    logger.info(f'image_dir:::{image_dir}')

    if parse_method == 'auto':
        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
                       start_page_id=start_page_id, end_page_id=end_page_id)
    elif parse_method == 'txt':
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
                       start_page_id=start_page_id, end_page_id=end_page_id)
    elif parse_method == 'ocr':
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
                       start_page_id=start_page_id, end_page_id=end_page_id)
    else:
        logger.error('unknown parse method')
        exit(1)

    # Decide whether this is a text-based pdf or an OCR pdf
    pipe.pipe_classify()

    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()
            # logger.info(f'pipe.model_list after pipe.pipe_analyze(): {pipe.model_list}')
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error('need model list input')
            exit(2)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data['pdf_info']
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    # if f_draw_span_bbox:
    #     draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    # if f_draw_model_bbox:
    #     drow_model_bbox(copy.deepcopy(orig_model_list), pdf_bytes, local_md_dir, pdf_file_name)

    md_content = pipe.pipe_mk_markdown(local_image_dir,
                                       drop_mode=DropMode.NONE,
                                       md_make_mode=f_make_md_mode)
    logger.info('-' * 100)
    logger.info(f'md_content:\n{md_content}')
    logger.info('-' * 100)
    try:
        md_writer.write(
            content=md_content,
            path=f'{pdf_file_name}.txt',
            mode=AbsReaderWriter.MODE_TXT,
        )
    except Exception as e:
        logger.info(f'failed to export the txt file for {pdf_file_name}, reason:\n{e}')
    filepath = os.path.join(str(local_md_dir), f'{pdf_file_name}.txt')
    logger.info(f'the txt file is saved at: {filepath}')
    remove_empty_lines_from_file(filepath)

    # if f_dump_md:
    #     md_writer.write(
    #         content=md_content,
    #         path=f'{pdf_file_name}.md',
    #         mode=AbsReaderWriter.MODE_TXT,
    #     )
    #
    # if f_dump_middle_json:
    #     md_writer.write(
    #         content=json_parse.dumps(pipe.pdf_mid_data,
    #                                  ensure_ascii=False,
    #                                  indent=4),
    #         path=f'{pdf_file_name}_middle.json',
    #         mode=AbsReaderWriter.MODE_TXT,
    #     )
    #
    # if f_dump_model_json:
    #     md_writer.write(
    #         content=json_parse.dumps(orig_model_list,
    #                                  ensure_ascii=False,
    #                                  indent=4),
    #         path=f'{pdf_file_name}_model.json',
    #         mode=AbsReaderWriter.MODE_TXT,
    #     )
    #
    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path=f'{pdf_file_name}_origin.pdf',
            mode=AbsReaderWriter.MODE_BIN,
        )
    #
    # content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    # if f_dump_content_list:
    #     md_writer.write(
    #         content=json_parse.dumps(content_list,
    #                                  ensure_ascii=False,
    #                                  indent=4),
    #         path=f'{pdf_file_name}_content_list.json',
    #         mode=AbsReaderWriter.MODE_TXT,
    #     )
    logger.info(f'local output dir is {local_md_dir}')


parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
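

# Illustrative sketch (not part of the commit): calling do_parse() directly with a PDF
# read from disk, matching the signature defined above. The file names are hypothetical;
# the two model_config flags mirror what cli.py sets before calling do_parse.
# model_config.__use_inside_model__ = True
# model_config.__model_mode__ = 'full'
# with open('demo.pdf', 'rb') as f:
#     pdf_bytes = f.read()
# do_parse(
#     output_dir='./output',
#     pdf_file_name='demo',
#     pdf_bytes=pdf_bytes,
#     model_list=[],        # an empty list triggers pipe.pipe_analyze() above
#     parse_method='auto',
#     debug_able=False,
# )
# Output lands under ./output/demo/auto/, as built by prepare_env().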
# -*- coding: utf-8 -*-
import argparse
import os
import time

import requests
from loguru import logger


class ocrPdfClient:
    def __init__(self, api_url):
        self.api_url = api_url

    def ocr_pdf_client(self, path, output_dir):
        payload = {
            "path": str(path),
            "output_dir": str(output_dir),
        }
        logger.info(f'pdf path: {path}, output path: {output_dir}')
        response = requests.post(f"{self.api_url}/pdf_ocr", json=payload)
        logger.info(f'response:{response}')
        if response.status_code == 200:
            return output_dir
        else:
            raise Exception(f"ocrPdf API request failed with status code {response.status_code}")


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--url',
        default='http://0.0.0.0:6030',
    )
    parser.add_argument(
        '--path',
        '-p',
        required=True
    )
    parser.add_argument(
        '--output_dir',
        '-o',
        required=True
    )
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    embedder = ocrPdfClient(args.url)
    doc_analyze_start = time.time()
    if not os.path.isabs(args.output_dir):
        current_working_directory = os.getcwd()
        output_dir = os.path.join(current_working_directory, args.output_dir)
        # logger.info(f'relative output_dir resolved to: {output_dir}')
    else:
        output_dir = args.output_dir
    logger.info(f'output_dir:{output_dir}')
    try:
        res = embedder.ocr_pdf_client(path=args.path, output_dir=output_dir)
        if res:
            logger.info(f"output_dir: '{res}'")
        else:
            logger.warning("None")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error while making request to the OCR PDF service: {e}")
    except Exception as e:
        logger.error(f"Unexpected error occurred: {e}")
    doc_analyze_cost = time.time() - doc_analyze_start
    logger.info(f'parsing the pdf {args.path} took {doc_analyze_cost} seconds')


if __name__ == "__main__":
    main()
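

# Illustrative sketch (not part of the commit): using ocrPdfClient without the CLI
# wrapper above. The URL and paths are hypothetical.
# client = ocrPdfClient("http://127.0.0.1:6030")
# out = client.ocr_pdf_client(path="/data/pdfs/demo.pdf", output_dir="/data/output")
# print(out)  # the output_dir is echoed back when the server answers HTTP 200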
import os
from pathlib import Path

import click
from loguru import logger
from typing import List
from fastapi import FastAPI, HTTPException, Request

import magic_pdf.model as model_config
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from argparse import ArgumentParser
from pydantic import BaseModel
import uvicorn
import time

app = FastAPI()
method = 'auto'


class ocrRequest(BaseModel):
    path: str
    output_dir: str


def parse_args():
    parser = ArgumentParser()
    parser.add_argument(
        '--dcu_id',
        default='0',
        help='DCU device id to use')
    parser.add_argument(
        '--pdf_port',
        type=int,
        default=6030,
        help='port for the pdf OCR service')
    parser.add_argument(
        '--method',
        type=parse_pdf_methods,
        help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperforms ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
        default='auto',
    )
    # parser.add_argument(
    #     '--start',
    #     type=int,
    #     help='The starting page for PDF parsing, beginning from 0.',
    #     default=0,
    # )
    # parser.add_argument(
    #     '--end',
    #     type=int,
    #     help='The ending page for PDF parsing, beginning from 0.',
    #     default=None,
    # )
    parser.add_argument(
        '--debug',
        type=bool,
        help='Enables detailed debugging information during the execution of the CLI commands.',
        default=False,
    )
    args = parser.parse_args()
    return args


def ocr_pdf_serve(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = args.dcu_id
    uvicorn.run(app, host="0.0.0.0", port=args.pdf_port)


@app.post("/pdf_ocr")
# def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
def pdf_ocr(request: ocrRequest):
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    output_dir = request.output_dir
    path = request.path
    os.makedirs(output_dir, exist_ok=True)
    debug_able = False
    start_page_id = 0
    end_page_id = None
    logger.info(f'method:{method},path:{path},output_dir:{output_dir}')

    def read_fn(path):
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str):
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            do_parse(
                output_dir,
                file_name,
                pdf_data,
                [],
                method,
                debug_able,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            # Find all PDF files under the directory
            for file in files:
                if file.endswith('.pdf'):
                    # Build the full path of the PDF file
                    doc_path = os.path.join(root, file)
                    start = time.time()
                    logger.info(f'Parsing: {doc_path}')
                    parse_doc(doc_path)
                    end = time.time()
                    logger.info(f'Parsing {doc_path} took {end - start} seconds')
    else:
        logger.info(f'Parsing: {path}')
        parse_doc(path)


def main():
    args = parse_args()
    ocr_pdf_serve(args)


if __name__ == '__main__':
    main()
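

# Illustrative sketch (not part of the commit): the JSON body accepted by the /pdf_ocr
# endpoint above, matching the ocrRequest model. The paths are hypothetical.
example_request_body = {
    "path": "/data/pdfs",          # a single .pdf file or a directory to walk
    "output_dir": "/data/output",  # created if missing; results go to <output_dir>/<name>/<method>/
}
# e.g. requests.post("http://127.0.0.1:6030/pdf_ocr", json=example_request_body)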
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
解析文本类pdf
"""
print('----------------------------------------------------------这是解析文本类pdf-------------------------------------------------------------')
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
解析ocr类pdf
"""
print('---------------------------------------------------------这是解析ocr类pdf------------------------------------------------------------------')
pdf_info_dict = parse_pdf_by_ocr(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
input_model_is_empty: bool = False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
"""
logger.info('---------------------------------------------------------ocr和文本混合的pdf,全部解析出来------------------------------------------------')
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=is_debug,
)
except Exception as e:
logger.exception(e)
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True,
start_page_id=start_page_id,
end_page_id=end_page_id)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
pdf_info_dict["_version_name"] = __version__
logger.info(f'这是pdf_union_pdf中的pdf_dict:\n{pdf_info_dict}\n-----------------------------------------------------------------------------------------')
return pdf_info_dict
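

# Illustrative sketch (not part of the commit): driving parse_union_pdf() directly.
# DiskReaderWriter stands in for the image writer and the paths are hypothetical;
# the import is only needed for this example.
# from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
# with open("demo.pdf", "rb") as f:
#     pdf_bytes = f.read()
# image_writer = DiskReaderWriter("./output/images")
# pdf_info = parse_union_pdf(pdf_bytes, [], image_writer,
#                            input_model_is_empty=True)  # runs doc_analyze if the txt route is dropped
# print(pdf_info["_parse_type"], pdf_info["_version_name"])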
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
PyMuPDF>=1.24.9
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
fast-langdetect==0.2.0
wordninja>=2.0.0
scikit-learn>=1.0.2
pdfminer.six==20231228
unimernet==0.1.6
matplotlib
ultralytics
paddleocr==2.7.3
paddlepaddle==3.0.0b1
pypandoc
struct-eqtable==0.1.0
detectron2
pytest
Levenshtein
nltk
rapidfuzz
statistics
openxlab  # installs opendatalab
pandas
numpy
matplotlib
seaborn
scipy
scikit-learn
tqdm
htmltabletomd
pypandoc
pyopenssl==24.0.0
struct-eqtable==0.1.0
pytest-cov
beautifulsoup4
boto3>=1.28.43
Brotli>=1.1.0
click>=8.1.7
fast-langdetect==0.2.0
loguru>=0.6.0
numpy>=1.21.6,<2.0.0
pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9
scikit-learn>=1.0.2
wordninja>=2.0.0
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
from pathlib import Path

from setuptools import setup, find_packages

from magic_pdf.libs.version import __version__


def parse_requirements(filename):
    with open(filename) as f:
        lines = f.read().splitlines()

    requires = []

    for line in lines:
        if "http" in line:
            pkg_name_without_url = line.split('@')[0].strip()
            requires.append(pkg_name_without_url)
        else:
            requires.append(line)

    return requires


if __name__ == '__main__':
    with Path(Path(__file__).parent,
              'README.md').open(encoding='utf-8') as file:
        long_description = file.read()
    setup(
        name="magic_pdf",  # project name
        version=__version__,  # version number, derived automatically from the git tag
        packages=find_packages() + ["magic_pdf.resources"],  # include all packages
        package_data={
            "magic_pdf.resources": ["**"],  # include every file under magic_pdf.resources
        },
        install_requires=parse_requirements('requirements.txt'),  # third-party dependencies
        extras_require={
            "lite": ["paddleocr==2.7.3",
                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",
                     ],
            "full": ["unimernet==0.1.6",  # 0.1.6 trims the dependency set considerably, so this version is recommended
                     "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1+ ships no Windows wheels, so cap it to avoid install failures on machines without a build toolchain
                     "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # do not cap matplotlib on linux/macos, so bug fixes can still be picked up
                     "ultralytics",  # yolov8, formula detection
                     "paddleocr==2.7.3",  # 2.8.0 and 2.8.1 conflict with detectron2, so pin to 2.7.3
                     "paddlepaddle==3.0.0b1;platform_system=='Linux'",  # fixes the segmentation fault on linux
                     "paddlepaddle==2.6.1;platform_system=='Windows' or platform_system=='Darwin'",  # 3.0.0b1 is slower on windows, so pin to 2.6.1
                     "pypandoc",  # latex-to-html conversion for table parsing
                     "struct-eqtable==0.1.0",  # table parsing
                     "detectron2"
                     ],
        },
        description="A practical tool for converting PDF to Markdown",  # short description
        long_description=long_description,  # long description
        long_description_content_type="text/markdown",  # the README is in Markdown
        url="https://github.com/opendatalab/MinerU",
        python_requires=">=3.9",  # required Python version
        entry_points={
            "console_scripts": [
                "magic-pdf = magic_pdf.tools.cli:cli",
                "magic-pdf-dev = magic_pdf.tools.cli_dev:cli"
            ],
        },  # console commands provided by the project
        include_package_data=True,  # include non-code files such as data and config files
        zip_safe=False,  # do not install as a zipped egg
    )
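

# Illustrative sketch (not part of the commit): the split performed by parse_requirements()
# on a direct-URL requirement line. The URL is hypothetical.
# line = "detectron2 @ https://example.com/detectron2.whl"
# print(line.split('@')[0].strip())  # -> "detectron2", which is what gets added to requires
# The extras declared above are installed with e.g. `pip install magic_pdf[full]`.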
import os
import subprocess


def get_version():
    command = ["git", "describe", "--tags"]
    try:
        version = subprocess.check_output(command).decode().strip()
        version_parts = version.split("-")
        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
            return version_parts[1]
        else:
            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
    except Exception as e:
        print(e)
        return "0.0.0"


def write_version_to_commons(version):
    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
    with open(commons_path, 'w') as f:
        f.write(f'__version__ = "{version}"\n')


if __name__ == '__main__':
    version_name = get_version()
    write_version_to_commons(version_name)
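

# Illustrative sketch (not part of the commit): how get_version() splits a tag. The tag
# value is hypothetical but follows the magic_pdf-<version>-released format expected above.
# tag = "magic_pdf-0.7.0-released"
# parts = tag.split("-")          # ["magic_pdf", "0.7.0", "released"]
# parts[1]                        # "0.7.0" — the value written into magic_pdf/libs/version.py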