Commit 826086d2 authored by zhougaofeng's avatar zhougaofeng
Browse files

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc,...

Deleted magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/__init__.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/Constants.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/pipe/__init__.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/detect_para.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/rw/__init__.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/pdf_server.py, magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/user_api.py files
parent 57aaa1cf
import json as json_parse
import os
from pathlib import Path
import click
import magic_pdf.model as model_config
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
def read_s3_path(s3path):
bucket, key = parse_s3path(s3path)
s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
remove_non_official_s3_args(s3path))
may_range_params = parse_s3_range_params(s3path)
if may_range_params is None or 2 != len(may_range_params):
byte_start, byte_end = 0, None
else:
byte_start, byte_end = int(may_range_params[0]), int(
may_range_params[1])
return s3_rw.read_offset(
remove_non_official_s3_args(s3path),
byte_start,
byte_end,
)
@click.group()
@click.version_option(__version__, '--version', '-v', help='显示版本信息')
def cli():
pass
@cli.command()
@click.option(
'-j',
'--jsonl',
'jsonl',
type=str,
help='输入 jsonl 路径,本地或者 s3 上的文件',
required=True,
)
@click.option(
'-m',
'--method',
'method',
type=parse_pdf_methods,
help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
default='auto',
)
@click.option(
'-o',
'--output-dir',
'output_dir',
type=click.Path(),
required=True,
help='输出到本地目录',
)
def jsonl(jsonl, method, output_dir):
model_config.__use_inside_model__ = False
if jsonl.startswith('s3://'):
jso = json_parse.loads(read_s3_path(jsonl).decode('utf-8'))
else:
with open(jsonl) as f:
jso = json_parse.loads(f.readline())
os.makedirs(output_dir, exist_ok=True)
s3_file_path = jso.get('file_location')
if s3_file_path is None:
s3_file_path = jso.get('path')
pdf_file_name = Path(s3_file_path).stem
pdf_data = read_s3_path(s3_file_path)
print(pdf_file_name, jso, method)
do_parse(
output_dir,
pdf_file_name,
pdf_data,
jso['doc_layout_result'],
method,
False,
f_dump_content_list=True,
f_draw_model_bbox=True,
)
@cli.command()
@click.option(
'-p',
'--pdf',
'pdf',
type=click.Path(exists=True),
required=True,
help='本地 PDF 文件',
)
@click.option(
'-j',
'--json',
'json_data',
type=click.Path(exists=True),
required=True,
help='本地模型推理出的 json 数据',
)
@click.option('-o',
'--output-dir',
'output_dir',
type=click.Path(),
required=True,
help='本地输出目录')
@click.option(
'-m',
'--method',
'method',
type=parse_pdf_methods,
help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
default='auto',
)
def pdf(pdf, json_data, output_dir, method):
model_config.__use_inside_model__ = False
full_pdf_path = os.path.realpath(pdf)
os.makedirs(output_dir, exist_ok=True)
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))
file_name = str(Path(full_pdf_path).stem)
pdf_data = read_fn(full_pdf_path)
do_parse(
output_dir,
file_name,
pdf_data,
model_json_list,
method,
False,
f_dump_content_list=True,
f_draw_model_bbox=True,
)
if __name__ == '__main__':
cli()
import copy
import json as json_parse
import os
import click
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_span_bbox,
drow_model_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
def prepare_env(output_dir, pdf_file_name, method):
local_parent_dir = os.path.join(output_dir, pdf_file_name, method)
local_image_dir = os.path.join(str(local_parent_dir), 'images')
local_md_dir = local_parent_dir
# logger.info(f'local_image_dir:{local_image_dir}')
# logger.info(f'local_md_dir:{local_md_dir}')
os.makedirs(local_image_dir, exist_ok=True)
os.makedirs(local_md_dir, exist_ok=True)
return local_image_dir, local_md_dir
def remove_empty_lines_from_file(file_path):
# 读取文件内容
with open(file_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# 删除空行
non_empty_lines = [line for line in lines if line.strip()]
# 将非空行写回原文件
with open(file_path, 'w', encoding='utf-8') as file:
file.writelines(non_empty_lines)
def do_parse(
ocr_status,
config_path,
output_dir,
pdf_file_name,
pdf_bytes,
model_list,
parse_method,
debug_able,
model,
f_draw_span_bbox=True,
f_draw_layout_bbox=True,
f_dump_md=True,
f_dump_middle_json=True,
f_dump_model_json=True,
f_dump_orig_pdf=True,
f_dump_content_list=False,
f_make_md_mode=MakeMode.MM_MD,
f_draw_model_bbox=False,
start_page_id=0,
end_page_id=None,
):
if debug_able:
logger.warning('debug mode is on')
f_dump_content_list = True
f_draw_model_bbox = True
orig_model_list = copy.deepcopy(model_list)
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
parse_method)
image_writer, md_writer = DiskReaderWriter(
local_image_dir), DiskReaderWriter(local_md_dir)
image_dir = str(os.path.basename(local_image_dir))
if parse_method == 'auto':
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=False,
start_page_id=start_page_id, end_page_id=end_page_id)
elif parse_method == 'txt':
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=False,
start_page_id=start_page_id, end_page_id=end_page_id)
elif parse_method == 'ocr':
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=False,
start_page_id=start_page_id, end_page_id=end_page_id)
else:
logger.error('unknown parse method')
exit(1)
# 判断是文本pdf,还是ocr pdf
pipe.pipe_classify()
if len(model_list) == 0:
if model_config.__use_inside_model__:
pipe.pipe_analyze(model)
orig_model_list = copy.deepcopy(pipe.model_list)
else:
logger.error('need model list input')
exit(2)
pipe.pipe_parse(ocr_status,config_path,local_image_dir)
pdf_info = pipe.pdf_mid_data['pdf_info']
if f_draw_layout_bbox:
draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
md_content = pipe.pipe_mk_markdown(ocr_status,config_path,local_image_dir,
drop_mode=DropMode.NONE,
md_make_mode=f_make_md_mode)
try:
md_writer.write(
content=md_content,
path=f'{pdf_file_name}.txt',
mode=AbsReaderWriter.MODE_TXT,
)
filepath = os.path.join(str(local_md_dir), f'{pdf_file_name}.txt')
# logger.info(f'txt文件保存在filepath:{filepath}')
remove_empty_lines_from_file(filepath)
if f_dump_orig_pdf:
md_writer.write(
content=pdf_bytes,
path=f'{pdf_file_name}_origin.pdf',
mode=AbsReaderWriter.MODE_BIN,
)
# logger.info(f'local output dir is {local_md_dir}')
return filepath
except Exception as e:
logger.error(f'{pdf_file_name}导出txt文件失败,具体原因为:\n{e}')
return None
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
import os
from pathlib import Path
import click
from loguru import logger
from typing import List
from fastapi import FastAPI, HTTPException, Request
import magic_pdf.model as model_config
from magic_pdf.dict2md.ocr_client import PredictClient
# from magic_pdf.dict2md.ocr_vllm_client import PredictClient
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from argparse import ArgumentParser
from pydantic import BaseModel
import uvicorn
import time
import configparser
#from magic_pdf.tools.config import update_config
app = FastAPI()
method = 'auto'
logger.add("parse.log", rotation="10 MB", level="INFO",
format="{time} {level} {message}", encoding='utf-8', enqueue=True)
config_path = None
ocr_status = None
custom_model = None
class ocrRequest(BaseModel):
path: str
output_dir: str
class ocrResponse(BaseModel):
status_code: int
output_path: str
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'--dcu_id',
default='0',
help='设置DCU')
parser.add_argument(
'--method',
type=parse_pdf_methods,
help = """the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
default = 'auto',
)
parser.add_argument(
'--debug',
type=bool,
help='Enables detailed debugging information during the execution of the CLI commands.',
default=False,
)
parser.add_argument(
'--config_path',
default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
args = parser.parse_args()
return args
def ocr_pdf_serve(args: str):
os.environ["CUDA_VISIBLE_DEVICES"] = args.dcu_id
config = configparser.ConfigParser()
config.read(args.config_path)
# host = config.get('server', 'pdf_host')
# port = int(config.get('server', 'pdf_port'))
pdf_server = config.get('server', 'pdf_server').split('://')[1]
host, port = pdf_server.split(':')[0], int(pdf_server.split(':')[1])
global config_path
config_path = args.config_path
ocr_server = config.get('server', 'ocr_server')
ocr_client = PredictClient(ocr_server)
global ocr_status
ocr_status = ocr_client.check_health()
ocr = True
show_log = False
model_manager = ModelSingleton()
global custom_model
custom_model = model_manager.get_model(ocr, show_log)
uvicorn.run(app, host=host, port=port)
@app.get("/health")
async def health_check():
return {"status": "healthy"}
@app.post("/pdf_ocr")
# def cli(path, output_dir, method, debug_able, start_page_id, end_page_id):
async def pdf_ocr(request: ocrRequest):
model_config.__use_inside_model__ = True
model_config.__model_mode__ = 'full'
output_dir = request.output_dir
path = request.path
#config_path = request.config_path
os.makedirs(output_dir, exist_ok=True)
debug_able = False
start_page_id = 0
end_page_id = None
logger.info(f"正在处理文件: {path}")
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
def parse_doc(doc_path: str, config_path: str):
try:
file_name = str(Path(doc_path).stem)
pdf_data = read_fn(doc_path)
output_path = do_parse(
ocr_status,
config_path,
output_dir,
file_name,
pdf_data,
[],
method,
debug_able,
model=custom_model,
start_page_id=start_page_id,
end_page_id=end_page_id,
)
# logger.info(f'文件解析成功:{output_path}')
return output_path
except Exception as e:
logger.exception(e)
# logger.info(f'config_path:{config_path}')
output_path = parse_doc(path,config_path)
if output_path:
logger.info(f'文件解析成功:{output_path}')
return {"status_code": 200, "output_path": output_path}
else:
logger.error(f'文件解析失败,文件为:{path}')
raise HTTPException(status_code=500)
def main():
args = parse_args()
ocr_pdf_serve(args)
if __name__ == '__main__':
main()
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
解析文本类pdf
"""
# print('----------------------------------------------------------这是解析文本类pdf-------------------------------------------------------------')
pdf_info_dict = parse_pdf_by_txt(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict
def parse_ocr_pdf(ocr_status,config_path,local_image_dir,pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
解析ocr类pdf
"""
# print('---------------------------------------------------------这是解析ocr类pdf------------------------------------------------------------------')
pdf_info_dict = parse_pdf_by_ocr(ocr_status,config_path,local_image_dir,
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=is_debug,
)
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
pdf_info_dict["_version_name"] = __version__
return pdf_info_dict
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
input_model_is_empty: bool = False,
start_page_id=0, end_page_id=None,
*args, **kwargs):
"""
ocr和文本混合的pdf,全部解析出来
"""
# logger.info('---------------------------------------------------------ocr和文本混合的pdf,全部解析出来------------------------------------------------')
def parse_pdf(method):
try:
return method(
pdf_bytes,
pdf_models,
imageWriter,
start_page_id=start_page_id,
end_page_id=end_page_id,
debug_mode=is_debug,
)
except Exception as e:
logger.exception(e)
return None
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
if input_model_is_empty:
pdf_models = doc_analyze(pdf_bytes, ocr=True,
start_page_id=start_page_id,
end_page_id=end_page_id)
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
if pdf_info_dict is None:
raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
else:
pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
pdf_info_dict["_version_name"] = __version__
# logger.info(f'这是pdf_union_pdf中的pdf_dict:\n{pdf_info_dict}\n-----------------------------------------------------------------------------------------')
return pdf_info_dict
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment