Commit 2df265c8 authored by zhougaofeng's avatar zhougaofeng
Browse files

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py,...

Update magic_pdf/__init__.py, magic_pdf/config.ini, magic_pdf/tmp.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/pdf_parse_union_core_v2.py, magic_pdf/config/__init__.py, magic_pdf/config/enums.py, magic_pdf/config/exceptions.py, magic_pdf/data/__init__.py, magic_pdf/data/schemas.py, magic_pdf/data/dataset.py, magic_pdf/data/utils.py, magic_pdf/data/read_api.py, magic_pdf/data/data_reader_writer/__init__.py, magic_pdf/data/data_reader_writer/base.py, magic_pdf/data/data_reader_writer/filebase.py, magic_pdf/data/data_reader_writer/s3.py, magic_pdf/data/data_reader_writer/multi_bucket_s3.py, magic_pdf/data/io/__init__.py, magic_pdf/data/io/base.py, magic_pdf/data/io/s3.py, magic_pdf/data/io/http.py, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/ocr_vllm_client.py, magic_pdf/dict2md/ocr_vllm_server.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/utils.py, magic_pdf/integrations/rag/type.py, magic_pdf/layout/__init__.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/bbox_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/clean_memory.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/language.py, 
magic_pdf/libs/markdown_utils.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/local_math.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/vis_utils.py, magic_pdf/libs/textbase.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/version.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/commons.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, 
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/commons.py, magic_pdf/para/draw.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/raw_processor.py, magic_pdf/para/title_processor.py, magic_pdf/para/para_split.py, magic_pdf/para/denoise.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/para_split_v3.py, magic_pdf/para/stats.py, magic_pdf/para/exceptions.py, magic_pdf/parse/__init__.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/ofd_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_spaces_html.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/remove_footer_header.py, 
magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/draw_ofd.py, magic_pdf/rw/ofdtemplate.py, magic_pdf/rw/pdf_parse.py, magic_pdf/rw/draw_pdf.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/cli.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/common.py, magic_pdf/tools/file_deal.py, magic_pdf/tools/img_deal.py, magic_pdf/tools/find_seal_img.py, magic_pdf/tools/font_tools.py, magic_pdf/tools/file_parser.py, magic_pdf/tools/parameter_parser.py, magic_pdf/tools/ofd.py, magic_pdf/tools/pdf_server.py, magic_pdf/tools/ofd_parser.py, magic_pdf/utils/__init__.py, magic_pdf/utils/annotations.py files
parent 826086d2
import json as json_parse
import os
from pathlib import Path
import click
import magic_pdf.model as model_config
from magic_pdf.libs.config_reader import get_s3_config
from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
remove_non_official_s3_args)
from magic_pdf.libs.version import __version__
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
def read_s3_path(s3path):
    """Fetch the bytes behind an s3:// URI, honoring an optional byte-range suffix."""
    bucket, _ = parse_s3path(s3path)
    ak, sk, endpoint = get_s3_config(bucket)
    official_path = remove_non_official_s3_args(s3path)
    reader = S3ReaderWriter(ak, sk, endpoint, 'auto', official_path)
    range_params = parse_s3_range_params(s3path)
    # A valid range is exactly two numbers; anything else means "whole object".
    if range_params is not None and len(range_params) == 2:
        start, end = int(range_params[0]), int(range_params[1])
    else:
        start, end = 0, None
    return reader.read_offset(official_path, start, end)
@click.group()
@click.version_option(__version__, '--version', '-v', help='显示版本信息')
def cli():
    """Root command group for this CLI; subcommands attach via @cli.command()."""
    pass
@cli.command()
@click.option(
    '-j',
    '--jsonl',
    'jsonl',
    type=str,
    help='输入 jsonl 路径,本地或者 s3 上的文件',
    required=True,
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='输出到本地目录',
)
def jsonl(jsonl, method, output_dir):
    """Parse one jsonl record (local file or s3 object) through the pdf pipeline."""
    model_config.__use_inside_model__ = False
    # Only the first record of a local jsonl file is processed.
    if jsonl.startswith('s3://'):
        raw = read_s3_path(jsonl).decode('utf-8')
    else:
        with open(jsonl) as f:
            raw = f.readline()
    record = json_parse.loads(raw)
    os.makedirs(output_dir, exist_ok=True)
    location = record.get('file_location')
    if location is None:
        location = record.get('path')
    name_stem = Path(location).stem
    pdf_bytes = read_s3_path(location)
    print(name_stem, record, method)
    # NOTE(review): verify these positionals against the current
    # magic_pdf.tools.common.do_parse signature — it appears to have gained
    # leading parameters (ocr_status, config_path, ... model).
    do_parse(
        output_dir,
        name_stem,
        pdf_bytes,
        record['doc_layout_result'],
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )
@cli.command()
@click.option(
    '-p',
    '--pdf',
    'pdf',
    type=click.Path(exists=True),
    required=True,
    help='本地 PDF 文件',
)
@click.option(
    '-j',
    '--json',
    'json_data',
    type=click.Path(exists=True),
    required=True,
    help='本地模型推理出的 json 数据',
)
@click.option(
    '-o',
    '--output-dir',
    'output_dir',
    type=click.Path(),
    required=True,
    help='本地输出目录',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=parse_pdf_methods,
    help='指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法',
    default='auto',
)
def pdf(pdf, json_data, output_dir, method):
    """Parse a local PDF using a pre-computed model-inference json."""
    model_config.__use_inside_model__ = False
    real_pdf_path = os.path.realpath(pdf)
    os.makedirs(output_dir, exist_ok=True)

    def _read_bytes(path):
        # DiskReaderWriter is rooted at the parent dir; read the basename in binary mode.
        rw = DiskReaderWriter(os.path.dirname(path))
        return rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    model_json_list = json_parse.loads(_read_bytes(json_data).decode('utf-8'))
    # NOTE(review): verify these positionals against the current
    # magic_pdf.tools.common.do_parse signature (see note on the jsonl command).
    do_parse(
        output_dir,
        str(Path(real_pdf_path).stem),
        _read_bytes(real_pdf_path),
        model_json_list,
        method,
        False,
        f_dump_content_list=True,
        f_draw_model_bbox=True,
    )
if __name__ == '__main__':
    cli()  # click dispatches to the `jsonl` / `pdf` subcommands
import copy
import json as json_parse
import os
import click
from loguru import logger
import magic_pdf.model as model_config
from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
draw_model_bbox, draw_span_bbox)
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.post_proc.remove_spaces_html import remove_extra_spaces_html_txt
def prepare_env(output_dir, pdf_file_name, method):
    """Create (if needed) and return the per-document output directories.

    Returns:
        tuple: (image_dir, md_dir), where image_dir is `<md_dir>/images`.
    """
    md_dir = os.path.join(output_dir, pdf_file_name, method)
    image_dir = os.path.join(md_dir, 'images')
    # Creating the deeper images path also creates the md dir.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(md_dir, exist_ok=True)
    return image_dir, md_dir
def remove_empty_lines_from_file(file_path):
    """Rewrite *file_path* in place, dropping empty / whitespace-only lines."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        kept = [ln for ln in fh if ln.strip()]
    with open(file_path, 'w', encoding='utf-8') as fh:
        fh.writelines(kept)
def do_parse(
    ocr_status,
    config_path,
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    debug_able,
    model,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
    f_draw_model_bbox=False,
    f_draw_line_sort_bbox=False,
    start_page_id=0,
    end_page_id=None,
    lang=None,
    layout_model=None,
    formula_enable=None,
    table_enable=None,
):
    """Run the pdf -> markdown/txt pipeline and return the txt file path.

    Selects a pipe by *parse_method* ('auto' | 'txt' | 'ocr'), runs model
    analysis when *model_list* is empty and the inside model is enabled,
    writes `<pdf_file_name>.txt` (plus optionally the original pdf) under
    `<output_dir>/<pdf_file_name>/<parse_method>/`, strips blank lines and
    extra html spaces from the txt, and returns its path — or None when the
    txt export fails. Exits the process on an unknown method (1) or when no
    model list is available (2).

    NOTE(review): several f_* flags (f_draw_span_bbox, f_dump_md,
    f_dump_middle_json, f_dump_model_json, f_dump_content_list,
    f_draw_model_bbox, f_draw_line_sort_bbox) are accepted but never read in
    this body — confirm whether they were meant to gate additional outputs.
    """
    if debug_able:
        logger.warning('debug mode is on')
        f_draw_model_bbox = True
        f_draw_line_sort_bbox = True
    # Keep a pristine copy; the pipes may mutate model_list in place.
    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
                                                parse_method)
    image_writer, md_writer = DiskReaderWriter(
        local_image_dir), DiskReaderWriter(local_md_dir)
    # Markdown references images relative to the md dir, so only the basename.
    image_dir = str(os.path.basename(local_image_dir))
    if parse_method == 'auto':
        jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
    elif parse_method == 'txt':
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
    elif parse_method == 'ocr':
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
                       start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
                       layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
    else:
        logger.error('unknown parse method')
        exit(1)
    pipe.pipe_classify()
    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            # No precomputed layout results: run the bundled model.
            pipe.pipe_analyze(model)
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error('need model list input')
            exit(2)
    pipe.pipe_parse(ocr_status,config_path,local_image_dir)
    pdf_info = pipe.pdf_mid_data['pdf_info']
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
    md_content = pipe.pipe_mk_markdown(image_dir,
                                       drop_mode=DropMode.NONE,
                                       md_make_mode=f_make_md_mode)
    try:
        txt_file = f'{pdf_file_name}.txt'
        md_writer.write(
            content=md_content,
            path=txt_file,
            mode=AbsReaderWriter.MODE_TXT,
        )
        filepath = os.path.join(str(local_md_dir), f'{pdf_file_name}.txt')
        # logger.info(f'txt文件保存在filepath:{filepath}')
        # Post-process the dumped txt: strip blank lines, then extra html spaces.
        remove_empty_lines_from_file(filepath)
        if f_dump_orig_pdf:
            md_writer.write(
                content=pdf_bytes,
                path=f'{pdf_file_name}_origin.pdf',
                mode=AbsReaderWriter.MODE_BIN,
            )
        remove_extra_spaces_html_txt(filepath)
        return filepath
    except Exception as e:
        logger.error(f'{pdf_file_name}导出txt文件失败,具体原因为:\n{e}')
        return None
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#PROJECT_NAME: D:\code\easyofd\easyofd\parser
#CREATE_TIME: 2023-07-27
#E_MAIL: renoyuan@foxmail.com
#AUTHOR: reno
#NOTE: 文件处理
import os
import base64
import shutil
from typing import Any
from uuid import uuid1
import xmltodict
import zipfile
class FileRead(object):
    """
    Unpack an OFD document (a zip archive) given as base64 and build an
    in-memory file tree, cleaning up all on-disk artifacts afterwards.

    file_tree keys:
        'root': unzip directory (OFD.xml lives at its top level)
        'root_doc': path of OFD.xml if present, else ''
        'pdf_name': derived pdf file name
        <xml path>: parsed xml object (xmltodict)
        <other path>: base64 string of the raw bytes
    """

    def __init__(self, ofdb64: str):
        self.ofdbyte = base64.b64decode(ofdb64)
        pid = os.getpid()
        # pid + uuid keeps concurrent processes from clobbering each other's temp files
        self.name = f"{pid}_{str(uuid1())}.ofd"
        self.pdf_name = self.name.replace(".ofd", ".pdf")
        self.zip_path = f"{os.getcwd()}/{self.name}"
        self.unzip_path = ""
        self.file_tree = {}
        # bugfix: defaults so unzip_file() is safe even when __call__ is bypassed
        self.save_xml = False
        self.xml_name = None

    def unzip_file(self):
        """Write the ofd bytes to disk and extract the archive next to it.

        :return: None (sets self.unzip_path)
        """
        with open(self.zip_path, "wb") as f:
            f.write(self.ofdbyte)
        # bugfix: split('.')[0] truncated the whole path when the cwd contained
        # a dot; only the trailing extension should be stripped
        self.unzip_path = self.zip_path.rsplit('.', 1)[0]
        with zipfile.ZipFile(self.zip_path, 'r') as zf:
            for member in zf.namelist():
                zf.extract(member, path=self.unzip_path)
        if self.save_xml:
            print("saving xml {}".format(self.xml_name))
            with zipfile.ZipFile(self.zip_path, 'r') as zf:
                for member in zf.namelist():
                    zf.extract(member, path=self.xml_name)

    def buld_file_tree(self):
        """Walk the unzip dir: xml files -> parsed objects, others -> b64 strings."""
        self.file_tree["root"] = self.unzip_path
        self.file_tree["pdf_name"] = self.pdf_name
        for root, dirs, files in os.walk(self.unzip_path):
            for file in files:
                abs_path = os.path.join(root, file)
                # bugfix: close file handles (the original leaked one per file)
                if "xml" in file:
                    with open(abs_path, "r", encoding="utf-8") as fh:
                        self.file_tree[abs_path] = xmltodict.parse(fh.read())
                else:
                    with open(abs_path, "rb") as fh:
                        self.file_tree[abs_path] = str(base64.b64encode(fh.read()), "utf-8")
        ofd_xml = os.path.join(self.unzip_path, "OFD.xml")
        self.file_tree["root_doc"] = ofd_xml if ofd_xml in self.file_tree else ""
        # Everything needed is now in file_tree; remove the on-disk artifacts.
        if os.path.exists(self.unzip_path):
            shutil.rmtree(self.unzip_path)
        if os.path.exists(self.zip_path):
            os.remove(self.zip_path)

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Unzip and index the document; returns the populated file_tree dict."""
        self.save_xml = kwds.get("save_xml", False)
        self.xml_name = kwds.get("xml_name")
        self.unzip_file()
        self.buld_file_tree()
        return self.file_tree
if __name__ == "__main__":
    # Ad-hoc smoke test against a local invoice file (developer-machine path).
    with open(r"D:/code/easyofd/test/增值税电子专票5.ofd","rb") as f:
        ofdb64 = str(base64.b64encode(f.read()),"utf-8")
    a = FileRead(ofdb64)()
    print(list(a.keys()))
\ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: D:\code\easyofd\easyofd\parser
# CREATE_TIME: 2023-07-27
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# NOTE: 每种类型的文件定义一个解析器
import sys
sys.path.insert(0, "..")
import logging
import os
import traceback
import base64
import re
from typing import Any
from magic_pdf.tools.parameter_parser import ParameterParser
logger = logging.getLogger("root")
class FileParserBase(object):
    """Base class for OFD xml parsers: holds the xmltodict tree plus a generic
    tag extractor used by all subclasses."""

    def __init__(self, xml_obj):
        # NOTE(review): assert is stripped under `python -O`; an explicit raise
        # would be safer, but callers may rely on AssertionError here.
        assert xml_obj
        self.ofd_param = ParameterParser()
        self.xml_obj = xml_obj

    def recursion_ext(self, need_ext_obj, ext_list, key):
        """
        Recursively collect every value stored under *key* into *ext_list*.

        need_ext_obj: xmltodict tree node (dict, or a list of nodes)
        ext_list: output container, mutated in place
        key: tag name to collect (e.g. "ofd:Page")
        """
        if isinstance(need_ext_obj, dict):
            for k, v in need_ext_obj.items():
                if k == key:
                    # A matching tag may hold one node (dict/str) or a list of nodes.
                    if isinstance(v, (dict, str)):
                        ext_list.append(v)
                    elif isinstance(v, list):
                        ext_list.extend(v)
                else:
                    if isinstance(v, dict):
                        self.recursion_ext(v, ext_list, key)
                    elif isinstance(v, list):
                        for cell in v:
                            self.recursion_ext(cell, ext_list, key)
                    else:
                        pass
        elif isinstance(need_ext_obj, list):
            # Generalization: a top-level list of nodes is now walked instead of
            # being dropped with only a type printout.
            for cell in need_ext_obj:
                self.recursion_ext(cell, ext_list, key)
        else:
            print(type(need_ext_obj))
class OFDFileParser(FileParserBase):
    """Parse OFD.xml: document roots, signatures and creator metadata."""

    # (output key, ofd tag) pairs extracted from the root xml, in output order
    _FIELDS = (
        ("doc_root", "ofd:DocRoot"),
        ("signatures", "ofd:Signatures"),
        ("creator", "ofd:Creator"),
        ("creationDate", "ofd:CreationDate"),
    )

    def __call__(self):
        info = {}
        for out_key, tag in self._FIELDS:
            values: list = []
            self.recursion_ext(self.xml_obj, values, tag)
            info[out_key] = values
        return info
class DocumentFileParser(FileParserBase):
    """
    Parse Document.xml (the per-doc root node): page size, resource file
    locations, content pages and template pages.
    """

    def loc2page_no(self, loc, idx):
        """Derive a page number from a BaseLoc string; fall back to the list index."""
        found = re.search(r"\d+", loc)
        return int(found.group()) if found else idx

    def __call__(self):
        document_info = {}

        def _extract(tag):
            bucket: list = []
            self.recursion_ext(self.xml_obj, bucket, tag)
            return bucket

        # ofd:PhysicalBox -> page size (first occurrence wins)
        physical_box = _extract("ofd:PhysicalBox")
        document_info["size"] = physical_box[0] if physical_box else ""
        # ofd:PublicRes -> font resource paths
        document_info["public_res"] = _extract("ofd:PublicRes")
        # ofd:DocumentRes -> static image resource paths
        document_info["document_res"] = _extract("ofd:DocumentRes")
        # ofd:Page -> content page locations plus an ID -> page-number map
        page = _extract("ofd:Page")
        page_id_map = {}
        if page:
            page_id_map = {
                node.get("@ID"): self.loc2page_no(node.get("@BaseLoc"), idx)
                for idx, node in enumerate(page)
            }
            page = [node.get("@BaseLoc") if isinstance(node, dict) else node for node in page]
        document_info["page"] = page
        document_info["page_id_map"] = page_id_map
        # ofd:TemplatePage -> template page locations
        tpls = _extract("ofd:TemplatePage")
        if tpls:
            tpls = [node.get("@BaseLoc") if isinstance(node, dict) else node for node in tpls]
        document_info["tpls"] = tpls
        return document_info
class ContentFileParser(FileParserBase):
    """
    Parse a page Content.xml (or template xml) into text, image and vector-line
    primitives with their page coordinates.
    """

    def fetch_cell_info(self, row, TextObject):
        """Build one text-cell dict from an ofd:TextObject node and one of its TextCodes."""
        # bugfix: a duplicated `cell_d = {}` initializer was removed
        cell_d = {}
        cell_d["ID"] = row['@ID']  # object id
        # Glyph transform info (code -> glyph mapping), present for embedded fonts.
        if row.get("ofd:CGTransform"):
            Glyphs_d = {
                "Glyphs": row.get("ofd:CGTransform").get("ofd:Glyphs"),
                "GlyphCount": row.get("ofd:CGTransform").get("@GlyphCount"),
                "CodeCount": row.get("ofd:CGTransform").get("@CodeCount"),
                "CodePosition": row.get("ofd:CGTransform").get("@CodePosition")
            }
            cell_d["Glyphs_d"] = Glyphs_d
        cell_d["pos"] = [float(pos_i) for pos_i in row['@Boundary'].split(" ")]  # text box
        if row.get('ofd:Clips', {}).get('ofd:Clip', {}).get('ofd:Area', {}).get('ofd:Path', {}):
            cell_d["clips_pos"] = [float(pos_i) for pos_i in
                                   row.get('ofd:Clips', {}).get('ofd:Clip', {}).get('ofd:Area', {}).get('ofd:Path',
                                                                                                        {}).get(
                                       '@Boundary', "").split(" ")]
        cell_d["text"] = str(TextObject.get('#text'))
        cell_d["font"] = row['@Font']  # font id
        cell_d["size"] = float(row['@Size'])  # font size
        color = self.ofd_param("ofd:FillColor", row).get("@Value", "0 0 0")
        cell_d["color"] = tuple(color.split(" "))  # fill color
        cell_d["DeltaY"] = TextObject.get("@DeltaY", "")  # per-char y offsets (vertical text)
        cell_d["DeltaX"] = TextObject.get("@DeltaX", "")  # per-char x offsets
        cell_d["CTM"] = row.get("@CTM", "")  # transform matrix
        cell_d["X"] = TextObject.get("@X", "")  # x offset inside the text box
        cell_d["Y"] = TextObject.get("@Y", "")  # y offset inside the text box
        return cell_d

    def __call__(self) -> list:
        """
        Return the page content as
        {'text_list': [...], 'img_list': [...], 'line_list': [...]}.
        """
        text_list = []
        img_list = []
        line_list = []
        content_d = {
            "text_list": text_list,
            "img_list": img_list,
            "line_list": line_list,
        }
        # ofd:TextObject -> body text
        text: list = []
        text_key = "ofd:TextObject"
        self.recursion_ext(self.xml_obj, text, text_key)
        for row in text:
            code = row.get('ofd:TextCode', {})
            if isinstance(code, list):
                for _i in code:
                    if not _i.get('#text'):
                        continue
                    text_list.append(self.fetch_cell_info(row, _i))
            elif isinstance(code, dict):
                if not code.get('#text'):
                    continue
                text_list.append(self.fetch_cell_info(row, code))
            else:
                logger.error(f"'ofd:TextCode' format nonsupport {row.get('ofd:TextCode', {})}")
                continue
        # ofd:PathObject -> vector lines
        line: list = []
        line_key = "ofd:PathObject"
        self.recursion_ext(self.xml_obj, line, line_key)
        for _i in line:
            line_d = {}
            line_d["ID"] = _i.get("@ID", "")  # object id
            line_d["pos"] = [float(pos_i) for pos_i in _i['@Boundary'].split(" ")]  # bounding box
            line_d["LineWidth"] = _i.get("@LineWidth", "")  # stroke width
            line_d["AbbreviatedData"] = _i.get("ofd:AbbreviatedData", "")  # path drawing commands
            line_d["FillColor"] = self.ofd_param("ofd:FillColor", _i).get('@Value', "0 0 0").split(" ")
            line_d["StrokeColor"] = self.ofd_param("ofd:StrokeColor", _i).get('@Value', "0 0 0")
            line_list.append(line_d)
        # ofd:ImageObject -> images
        img: list = []
        img_key = "ofd:ImageObject"
        self.recursion_ext(self.xml_obj, img, img_key)
        for _i in img:
            img_d = {}
            img_d["CTM"] = _i.get("@CTM", "")  # transform matrix
            # bugfix: xmltodict prefixes attributes with '@'; plain "ID" always missed
            img_d["ID"] = _i.get("@ID", "")
            img_d["ResourceID"] = _i.get("@ResourceID", "")  # id of the image resource
            img_d["pos"] = [float(pos_i) for pos_i in _i['@Boundary'].split(" ")]  # bounding box
            img_list.append(img_d)
        return content_d
class DocumentResFileParser(FileParserBase):
    """Parse DocumentRes.xml: map multimedia resource IDs to file metadata."""

    def __call__(self):
        info = {}
        media_nodes: list = []
        self.recursion_ext(self.xml_obj, media_nodes, "ofd:MultiMedia")
        for media in media_nodes:
            file_name = media.get("ofd:MediaFile", "")
            info[media.get("@ID")] = {
                "format": media.get("@Format", ""),
                "wrap_pos": media.get("@wrap_pos", ""),
                "type": media.get("@Type", ""),
                # file extension without the leading dot
                "suffix": os.path.splitext(file_name)[-1].replace(".", ""),
                "fileName": file_name,
            }
        return info
class PublicResFileParser(FileParserBase):
    """Parse PublicRes.xml: map font resource IDs to font descriptors."""

    # attribute names copied verbatim from each ofd:Font node
    _ATTRS = ("FontName", "FamilyName", "Bold", "Serif", "FixedWidth")

    def __call__(self):
        info = {}
        fonts: list = []
        self.recursion_ext(self.xml_obj, fonts, "ofd:Font")
        for node in fonts:
            entry = {attr: node.get(f"@{attr}") for attr in self._ATTRS}
            entry["FontFile"] = node.get("ofd:FontFile")
            info[node.get("@ID")] = entry
        return info
class AnnotationFileParser(FileParserBase):
    """
    Parser for Annotation.xml (signature/annotation appearance data).

    Placeholder: not implemented yet; instances only carry base-class state.
    """
    pass
class SignaturesFileParser(FileParserBase):
    """Parse Signatures.xml: the index of all signature entries in the document."""

    def __call__(self):
        info = {}
        entries: list = []
        self.recursion_ext(self.xml_obj, entries, "ofd:Signature")
        for node in entries:
            info[node.get("@ID")] = {
                "BaseLoc": node.get("@BaseLoc"),
                "Type": node.get("@Type"),
                "ID": node.get("@ID"),
            }
        return info
class SignatureFileParser(FileParserBase):
    """Parse a single Signature.xml: stamp placement plus SignedValue location."""

    def __call__(self, prefix=""):
        info = {}
        stamps: list = []
        self.recursion_ext(self.xml_obj, stamps, "ofd:StampAnnot")
        signed_values: list = []
        self.recursion_ext(self.xml_obj, signed_values, "ofd:SignedValue")
        # NOTE(review): with several StampAnnot nodes only the last one survives —
        # confirm the single-stamp assumption with callers.
        for node in stamps:
            info = {
                "PageRef": node.get("@PageRef"),  # id of the page the stamp sits on
                "Boundary": node.get("@Boundary"),
                "ID": node.get("@ID"),
                "SignedValue": f"{prefix}/{signed_values[0]}" if signed_values else f"{prefix}/SignedValue.dat",
            }
        return info
if __name__ == "__main__":
    # NOTE(review): smoke call only — FileParserBase("") trips its own assert and
    # the base class defines no __call__, so this line cannot succeed as written.
    FileParserBase("")()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: easyofd read_seal_img
# CREATE_TIME: 2024/5/28 14:13
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: renoyuan
# note: 根据 ASN.1 解析签章 拿到 签章图片
import io
import base64
from PIL import Image, UnidentifiedImageError
from loguru import logger
from pyasn1.codec.der.decoder import decode
from pyasn1.type import univ
from pyasn1.error import PyAsn1Error
class SealExtract(object):
    """Extract seal (stamp) images from an OFD SignedValue.dat via ASN.1 decoding."""

    def __init__(self,):
        pass

    def read_signed_value(self, path="", b64=""):
        """Load signature bytes from *b64* (preferred) or *path* and DER-decode them.

        :return: the decoded ASN.1 structure, or None when decoding fails or no
                 input was given.
        """
        if b64:
            binary_data = base64.b64decode(b64)
        elif path:
            with open(path, 'rb') as file:
                binary_data = file.read()
        else:
            return None
        # bugfix: the original returned from a `finally` block, which would have
        # swallowed any exception other than PyAsn1Error (and then raised a
        # confusing NameError on the unbound local).
        try:
            decoded_data, _ = decode(binary_data)
        except PyAsn1Error:
            decoded_data = None
        return decoded_data

    def find_octet_strings(self, asn1_data, octet_strings: list):
        """Recursively collect every univ.OctetString instance into *octet_strings*."""
        if isinstance(asn1_data, univ.OctetString):
            octet_strings.append(asn1_data)
        elif isinstance(asn1_data, (univ.Sequence, univ.Set)):
            for component in asn1_data:
                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
        elif isinstance(asn1_data, univ.Choice):
            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
        elif isinstance(asn1_data, univ.Any):
            # Any wraps opaque bytes that may themselves be DER; try decoding deeper.
            try:
                sub_data, _ = decode(asn1_data.asOctets())
                self.find_octet_strings(sub_data, octet_strings)
            except PyAsn1Error:
                pass

    def hex_to_image(self, hex_data, image_format='PNG', inx=0):
        """
        Decode a hex string into a PIL Image.

        :param hex_data: image bytes as a hex string (no 0x prefix)
        :param image_format: kept for interface compatibility; saving is disabled
        :return: a PIL Image, or None when the bytes are not a recognizable image
        """
        binary_data = bytes.fromhex(hex_data)
        image_stream = io.BytesIO(binary_data)
        try:
            return Image.open(image_stream)
        except UnidentifiedImageError:
            return None

    def __call__(self, path="", b64=""):
        """Return a list of PIL images found in the signature (usually 0 or 1)."""
        decoded_data = self.read_signed_value(path=path, b64=b64)
        octet_strings = []
        img_list = []  # currently at most one seal is expected; multi-seal linking TBD
        if decoded_data:
            self.find_octet_strings(decoded_data, octet_strings)
            for i, octet_string in enumerate(octet_strings):
                pretty = str(octet_string.prettyPrint())
                # prettyPrint renders binary payloads as 0x<hex>; those are image candidates
                if pretty.startswith("0x"):
                    img = self.hex_to_image(pretty[2:], inx=i)
                    if img:
                        logger.info("ASN.1 data found.")
                        img_list.append(img)
        else:
            logger.info("No valid ASN.1 data found.")
        return img_list
if __name__=="__main__":
    # Developer smoke test against a local SignedValue.dat (machine-specific path).
    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#PROJECT_NAME: D:\code\easyofd\easyofd
#CREATE_TIME: 2023-07-27
#E_MAIL: renoyuan@foxmail.com
#AUTHOR: reno
#NOTE: 字体处理
import time
import re
import json
import base64
import zipfile
import os
import shutil
import logging
from io import BytesIO, StringIO
import string
from uuid import uuid1
import random
import traceback
import logging
import tempfile
import xmltodict
from fontTools.ttLib import TTFont as ttLib_TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing, scale
from reportlab import platypus
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.units import mm,inch
from reportlab.platypus import SimpleDocTemplate, Image
from reportlab.lib.utils import ImageReader
from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont
from reportlab.pdfbase.ttfonts import TTFont
from concurrent.futures import ThreadPoolExecutor
import threading
import multiprocessing
import PIL
from reportlab.lib.fonts import _tt2ps_map
from reportlab.lib.fonts import _family_alias
from loguru import logger
# Font names seen in OFD files (including subset-prefixed aliases such as "SWPMEH+SimSun").
FONTS = ['宋体',"SWPMEH+SimSun",'SimSun','KaiTi','楷体',"STKAITI","SWLCQE+KaiTi",
'Courier New','STSong-Light',"CourierNew","SWANVV+CourierNewPSMT",
"CourierNewPSMT","BWSimKai","hei","黑体","SimHei","SWDKON+SimSun",
"SWCRMF+CourierNewPSMT","SWHGME+KaiTi"]
# Mapping of font name -> ttf/ttc file; looked up under path_prefix unless absolute.
FONTS_ttf = [{'宋体':'simsun.ttc'},{'SimSun':'simsun.ttc'},{'KaiTi':'simkai.ttf'},{'楷体':'simkai.ttf'},
{'STKAITI':'STKAITI.TTF'},{'hei':'simhei.ttf'},{'STXihei':'simhei.ttf'},{'黑体':'simhei.ttf'},
{'华文细黑':'STXIHEI.TTF'},{'SimHei':'simhei.ttf'},{'Courier New':'/usr/share/fonts/truetype/msttcorefonts/cour.ttf'}]
# Directory holding the bundled CJK font files.
path_prefix = "/usr/local/share/fonts/zit/"
# Register fonts in bulk (NOTE: runs at import time as a module-level side effect).
for font in FONTS_ttf:
    for font_name, file_name in font.items():
        try:
            # Entries containing 'usr' are treated as absolute paths already.
            if 'usr' in file_name:
                font_path = file_name
            else:
                font_path = os.path.join(path_prefix, file_name)
            pdfmetrics.registerFont(TTFont(font_name, font_path))
        except Exception as e:
            print(f"无法注册字体 {font_name}: {e}")
class FontTool(object):
    """Utilities for checking and registering TTF fonts with reportlab."""

    # Fonts assumed available; register_font() appends newly registered names.
    FONTS = FONTS

    def __init__(self):
        pass

    def font_check(self):
        """Log which of the expected fonts are registered with reportlab."""
        # bugfix: the 'f' prefix was inside the quotes ("f{_tt2ps_map}"), so the
        # literal text was logged instead of the mapping contents
        logger.info(f"{_tt2ps_map}")
        logger.info(f"{_family_alias}")
        for font in self.FONTS:
            if font in _tt2ps_map.values():
                logger.info(f"已注册{font}")
            else:
                logger.warning(f"-{font}-未注册可能导致写入失败")

    def register_font(self, file_name, FontName, font_b64):
        """Register a base64-encoded font with reportlab via a temp file.

        file_name: original font file name (any directory part is stripped)
        FontName: name to register under; defaults to the file-name stem
        font_b64: base64 payload; the method is a no-op when empty
        """
        if not font_b64:
            return
        file_name = os.path.split(file_name)
        if isinstance(file_name, (tuple, list)):
            file_name = file_name[1]
        if not FontName:
            FontName = file_name.split(".")[0]
        try:
            # The ttf bytes are written to the cwd, registered, then removed.
            with open(file_name, "wb") as f:
                f.write(base64.b64decode(font_b64))
            pdfmetrics.registerFont(TTFont(FontName, file_name))
            self.FONTS.append(FontName)
        except Exception as e:
            traceback.print_exc()
            logger.error(f"register_font_error:\n{e} \n 包含不支持解析字体格式")
        finally:
            if os.path.exists(file_name):
                os.remove(file_name)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: easyofd img_deal
# CREATE_TIME: 2024/7/18 11:20
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: renoyuan
# note: img 操作
from io import BytesIO
class DealImg(object):
    """Helpers for serializing PIL images to PNG bytes / byte streams."""

    def __init__(self):
        pass

    def resize(self):
        """Resize an image (not implemented yet)."""
        pass

    def pil2bytes(self, image):
        """Serialize a PIL image to PNG and return the raw bytes."""
        with BytesIO() as buffer:
            image.save(buffer, format='PNG')  # PNG keeps the pipeline lossless
            return buffer.getvalue()

    def pil2bytes_io(self, image):
        """Serialize a PIL image to PNG and return an open BytesIO stream."""
        stream = BytesIO()
        image.save(stream, format='PNG')
        return stream
# -*- coding: utf-8 -*-
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: F:\code\easyofd\easyofd
# CREATE_TIME: 2023-10-07
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# note: ofd 基础类
import base64
import os
import sys
from io import BytesIO
from typing import Union
sys.path.insert(0, os.getcwd())
sys.path.insert(0, "..")
import fitz
from PIL import Image
from loguru import logger
from magic_pdf.tools.ofd_parser import OFDParser
from magic_pdf.rw.draw_pdf import DrawPDF
from magic_pdf.rw.draw_ofd import OFDWrite
class OFD(object):
    """In-memory OFD document: read/parse and convert between OFD, PDF and images."""

    def __init__(self, ):
        # Parsed document structure produced by OFDParser; None until read().
        self.data = None

    def read(self, ofd_f: Union[str, bytes, BytesIO], fmt="b64", save_xml=False, xml_name="testxml"):
        """Load an OFD document and parse it into self.data.

        Args:
            ofd_f: the document as a path, base64 string, raw bytes or BytesIO,
                according to fmt.
            fmt: one of "path", "b64", "binary", "io".
            save_xml: forwarded to OFDParser; dump intermediate XML when True.
            xml_name: base name for the dumped XML.

        Raises:
            ValueError: if fmt is not one of the supported values.
        """
        if fmt == "path":
            with open(ofd_f, "rb") as f:
                ofd_f = str(base64.b64encode(f.read()), encoding="utf-8")
        elif fmt == "b64":
            pass
        elif fmt == "binary":
            ofd_f = str(base64.b64encode(ofd_f), encoding="utf-8")
        elif fmt == "io":
            ofd_f = str(base64.b64encode(ofd_f.getvalue()), encoding="utf-8")
        else:
            # BUGFIX: raising a plain string is a TypeError on Python 3;
            # raise a proper exception carrying the same message.
            raise ValueError("fomat Error: %s" % fmt)
        self.data = OFDParser(ofd_f)(save_xml=save_xml, xml_name=xml_name)

    def save(self, ):
        """Draw the OFD XML back to a file (not implemented yet)."""
        assert self.data, "data is None"

    def pdf2ofd(self, pdfbyte, optional_text=False):
        """Convert PDF bytes to OFD bytes."""
        assert pdfbyte, "pdfbyte is None"
        ofd_byte = OFDWrite()(pdfbyte, optional_text=optional_text)
        return ofd_byte

    def to_pdf(self, ):
        """Render the parsed OFD data (self.data) to PDF bytes."""
        assert self.data, "data is None"
        return DrawPDF(self.data)()

    def pdf2img(self, pdfbytes):
        """Rasterize every page of a PDF into PIL images at 2x zoom."""
        image_list = []
        doc = fitz.open(stream=pdfbytes, filetype="pdf")
        for page in doc:
            rotate = 0
            zoom_x, zoom_y = 2, 2  # 2x resolution for readable OCR/preview
            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image_list.append(pil_image)
        logger.info(f"pdf2img")
        return image_list

    def jpg2ofd(self, imglist: list):
        """Build an OFD document from a list of PIL images."""
        ofd_byte = OFDWrite()(pil_img_list=imglist)
        return ofd_byte

    def jpg2pfd(self, imglist: list):
        """Build a PDF from a list of PIL images (name kept for compatibility)."""
        data = OFDParser(None).img2data(imglist)
        return DrawPDF(data)()

    def to_jpg(self, format="jpg"):
        """Render the parsed OFD to images; returns (pil_image_list, pdf_bytes)."""
        assert self.data, "data is None"
        pdfbytes = self.to_pdf()
        image_list = self.pdf2img(pdfbytes)
        return image_list, pdfbytes

    def del_data(self, ):
        """Drop the parsed document data."""
        self.data = None

    def __del__(self):
        # NOTE(review): `del self` only unbinds the local name; it has no real
        # effect, kept for compatibility with existing callers of disposal().
        del self

    def disposal(self, ):
        """Explicitly destroy the object (delegates to __del__)."""
        self.__del__()


if __name__ == "__main__":
    ofd = OFD()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: D:\code\easyofd\easyofd\parser
# CREATE_TIME: 2023-07-27
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: reno
# NOTE: ofd解析主流程
import os
import sys
sys.path.insert(0, "..")
import traceback
import base64
import re
from typing import Any, List
from PIL import Image
from PIL.Image import Image as ImageClass
from loguru import logger
from magic_pdf.tools.img_deal import DealImg
from magic_pdf.tools.file_deal import FileRead
from magic_pdf.tools.file_parser import (OFDFileParser, DocumentFileParser, ContentFileParser, DocumentResFileParser,
PublicResFileParser,
SignaturesFileParser, SignatureFileParser)
class OFDParser(object):
    """
    OFD parsing pipeline:
    1. unzip the container, build the file map, release the files
    2. walk the XML level by level, collecting text structure and resources
    3. register the embedded fonts
    """

    def __init__(self, ofdb64):
        self.img_deal = DealImg()
        # base64-encoded OFD container (may be None when only img2data is used)
        self.ofdb64 = ofdb64
        # archive path -> parsed xml object / raw b64; populated in __call__
        self.file_tree = None
        # external tool used to decode jb2 images (Windows default location)
        self.jbig2dec_path = r"C:/msys64/mingw64/bin/jbig2dec.exe"

    def img2data(self, imglist: List[ImageClass]):
        """Build a minimal parser-compatible doc structure from PIL images."""
        OP = 200 / 25.4  # px-per-mm at 200 DPI: converts pixels to OFD mm units
        doc_list = []
        img_info = {}
        page_size = []
        font_info = {}
        page_info_d = {}
        for idx, img_pil in enumerate(imglist):
            w, h = img_pil.size
            img_bytes = self.img_deal.pil2bytes(img_pil)
            imgb64 = str(base64.b64encode(img_bytes), encoding="utf-8")
            img_info[str(idx)] = {
                "format": "jpg",
                "wrap_pos": "",
                "type": "IMG",
                "suffix": "jpg",
                "fileName": f"{idx}.jpg",
                "imgb64": imgb64,
            }
            img_d = {
                "CTM": "",  # transform matrix (translate / scale / rotate)
                "ID": str(idx),  # image id
                "ResourceID": str(idx),  # image id
                "pos": [0, 0, w / OP, h / OP],  # full-page placement in mm
            }
            page_size = [0, 0, w / OP, h / OP]
            page_info_d[idx] = {
                "text_list": [],
                "img_list": [img_d],
            }
        doc_list.append({
            "pdf_name": "demo.pdf",
            "doc_no": "0",
            "images": img_info,
            "page_size": page_size,
            "fonts": font_info,
            "page_info": page_info_d
        })
        return doc_list

    def get_xml_obj(self, label):
        """Return the file-tree entry matching label, or "" when absent."""
        assert label
        # Normalize separators so Windows and Linux paths compare equal.
        label_compare = label.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
        for abs_p in self.file_tree:
            abs_p_compare = abs_p.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
            if label_compare in abs_p_compare:
                return self.file_tree[abs_p]
        return ""

    def jb22png(self, img_d: dict):
        """Convert a jb2 image entry to png in place via the jbig2dec binary.

        No-op (with a warning) when jbig2dec is not installed.
        """
        if not os.path.exists(self.jbig2dec_path):
            logger.warning(f"未安装jbig2dec,无法处理jb2文件")
            return
        fileName = img_d["fileName"]
        new_fileName = img_d['fileName'].replace(".jb2", ".png")
        # Materialize the jb2 bytes so the external tool can read them.
        with open(fileName, "wb") as f:
            f.write(base64.b64decode(img_d["imgb64"]))
        command = "{} -o {} {}"
        res = os.system(command.format(self.jbig2dec_path, new_fileName, fileName))
        if res != 0:
            logger.warning(f"jbig2dec处理失败")
        if os.path.exists(fileName):
            os.remove(fileName)
        if os.path.exists(new_fileName):
            logger.info(f"jbig2dec处理成功{fileName}>>{new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "png"
            img_d["format"] = "png"
            with open(new_fileName, "rb") as f:
                data = f.read()
            img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")
            os.remove(new_fileName)

    def bmp2jpg(self, img_d: dict):
        """Convert a bmp image entry to jpg in place (temp file + PIL)."""
        fileName = img_d["fileName"]
        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
        with open(fileName, "wb") as f:
            f.write(base64.b64decode(img_d["imgb64"]))
        bmp_image = Image.open(fileName)
        bmp_image.convert("RGB").save(new_fileName, "JPEG")
        bmp_image.close()
        # BUGFIX: the original leaked the temporary bmp file.
        if os.path.exists(fileName):
            os.remove(fileName)
        if os.path.exists(new_fileName):
            # BUGFIX: the log wrongly mentioned jbig2dec; this is the bmp path.
            logger.info(f"bmp转jpg处理成功{fileName}>>{new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "jpg"
            img_d["format"] = "jpg"
            with open(new_fileName, "rb") as f:
                data = f.read()
            img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")
            os.remove(new_fileName)

    def parser(self, ):
        """
        Main parse flow: OFD.xml -> Document.xml -> resources / signatures /
        pages / templates, assembled into one dict per document.
        Currently assumes a single Doc_0 level.
        """
        page_size_details = []
        default_page_size = []
        doc_list = []
        ofd_xml_obj = self.get_xml_obj(self.file_tree["root_doc"])  # OFD.xml
        if ofd_xml_obj:
            ofd_obj_res = OFDFileParser(ofd_xml_obj)()
            doc_root_name = ofd_obj_res.get("doc_root")
            signatures = ofd_obj_res.get("signatures")
        else:
            # Fall back to the conventional layout when the root node is lost.
            doc_root_name = ["Doc_0/Document.xml"]
            signatures = ["Doc_0/Signs/Signatures.xml"]
        doc_root_xml_obj = self.get_xml_obj(doc_root_name[0])
        doc_root_info = DocumentFileParser(doc_root_xml_obj)()
        doc_size = doc_root_info.get("size")
        if doc_size:
            try:
                default_page_size = [float(pos_i) for pos_i in doc_size.split(" ") if re.match(r"[\d\.]", pos_i)]
            except Exception:
                traceback.print_exc()
        # Font info; embedded font bytes are attached for later registration.
        font_info = {}
        public_res_name: list = doc_root_info.get("public_res")
        if public_res_name:
            public_xml_obj = self.get_xml_obj(public_res_name[0])
            font_info = PublicResFileParser(public_xml_obj)()
        for font_id, font_v in font_info.items():
            file_name = font_v.get("FontFile")
            if file_name:
                font_b64 = self.get_xml_obj(file_name)
                if font_b64:
                    font_v["font_b64"] = font_b64
        # Image resources.
        img_info: dict = dict()
        document_res_name: list = doc_root_info.get("document_res")
        if document_res_name:
            document_res_xml_obj = self.get_xml_obj(document_res_name[0])
            img_info = DocumentResFileParser(document_res_xml_obj)()
        # Resolve the image payloads; convert formats PIL/reportlab can't use.
        for img_id, img_v in img_info.items():
            img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
            if img_v["suffix"] == 'jb2':
                self.jb22png(img_v)
            if img_v["suffix"] == 'bmp':
                self.bmp2jpg(img_v)
        page_id_map: list = doc_root_info.get("page_id_map")
        signatures_page_id = {}
        # Signature info, grouped by the page number each seal belongs to.
        if signatures and (signatures_xml_obj := self.get_xml_obj(signatures[0])):
            print("signatures_xml_obj", signatures, signatures_xml_obj)
            signatures_info = SignaturesFileParser(signatures_xml_obj)()
            if signatures_info:
                for _, signatures_cell in signatures_info.items():
                    BaseLoc = signatures_cell.get("BaseLoc")
                    signature_xml_obj = self.get_xml_obj(BaseLoc)
                    prefix = BaseLoc.split("/")[0]
                    signatures_info = SignatureFileParser(signature_xml_obj)(prefix=prefix)
                    print("signatures_info", signatures_info)
                    PageRef = signatures_info.get("PageRef")
                    Boundary = signatures_info.get("Boundary")
                    SignedValue = signatures_info.get("SignedValue")
                    sing_page_no = page_id_map.get(PageRef)
                    signatures_page_id.setdefault(sing_page_no, []).append(
                        {
                            "sing_page_no": sing_page_no,
                            "PageRef": PageRef,
                            "Boundary": Boundary,
                            "SignedValue": self.get_xml_obj(SignedValue),
                        }
                    )
        # Page contents (possibly many pages).
        page_name: list = doc_root_info.get("page")
        page_info_d = {}
        if page_name:
            for index, _page in enumerate(page_name):
                page_xml_obj = self.get_xml_obj(_page)
                # Re-read the physical size per page.
                try:
                    page_size = [float(pos_i) for pos_i in
                                 page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
                                                                                         "").split(" ")
                                 if re.match(r"[\d\.]", pos_i)]
                    if page_size and len(page_size) >= 2:
                        page_size_details.append(page_size)
                    else:
                        page_size_details.append([])
                except Exception:
                    traceback.print_exc()
                    # BUGFIX: keep one entry per page in page_size_details
                    # (the original appended to page_size by mistake).
                    page_size_details.append([])
                page_info = ContentFileParser(page_xml_obj)()
                pg_no = re.search(r"\d+", _page)
                if pg_no:
                    pg_no = int(pg_no.group())
                else:
                    pg_no = index
                page_info_d[pg_no] = page_info
        # Template layers are merged into the page with the matching number.
        tpls_name: list = doc_root_info.get("tpls")
        if tpls_name:
            pos_key = lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0]))
            for index, _tpl in enumerate(tpls_name):
                tpl_xml_obj = self.get_xml_obj(_tpl)
                tpl_info = ContentFileParser(tpl_xml_obj)()
                tpl_no = re.search(r"\d+", _tpl)
                if tpl_no:
                    tpl_no = int(tpl_no.group())
                else:
                    tpl_no = index
                if tpl_no in page_info_d:
                    # BUGFIX: the original indexed with pg_no, the stale loop
                    # variable of the page loop above; merge into page tpl_no.
                    page_info_d[tpl_no]["text_list"].extend(tpl_info["text_list"])
                    page_info_d[tpl_no]["text_list"].sort(key=pos_key)
                    page_info_d[tpl_no]["img_list"].extend(tpl_info["img_list"])
                    page_info_d[tpl_no]["img_list"].sort(key=pos_key)
                    page_info_d[tpl_no]["line_list"].extend(tpl_info["line_list"])
                    page_info_d[tpl_no]["line_list"].sort(key=pos_key)
                else:
                    page_info_d[tpl_no] = tpl_info
                    # BUGFIX: tpl_info is a dict of lists; the original called
                    # .sort() on the dict itself (AttributeError). Sort each
                    # element list instead.
                    for _lst_name in ("text_list", "img_list", "line_list"):
                        if isinstance(tpl_info.get(_lst_name), list):
                            tpl_info[_lst_name].sort(key=pos_key)
        page_ID = 0  # multiple docs per OFD not observed yet
        doc_list.append({
            "default_page_size": default_page_size,
            "page_size": page_size_details,
            "pdf_name": self.file_tree["pdf_name"],
            "doc_no": page_ID,
            "images": img_info,
            "signatures_page_id": signatures_page_id,
            "page_id_map": page_id_map,
            "fonts": font_info,
            "page_info": page_info_d
        })
        return doc_list

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Unpack self.ofdb64 into the file tree and return the parse result."""
        save_xml = kwds.get("save_xml", False)
        xml_name = kwds.get("xml_name")
        self.file_tree = FileRead(self.ofdb64)(save_xml=save_xml, xml_name=xml_name)
        return self.parser()


if __name__ == "__main__":
    with open(r"E:\code\easyofd\test\增值税电子专票5.ofd", "rb") as f:
        ofdb64 = str(base64.b64encode(f.read()), "utf-8")
    print(OFDParser(ofdb64)())
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PROJECT_NAME: easyofd
# CREATE_TIME:
# E_MAIL: renoyuan@foxmail.com
# AUTHOR: renoyuan
# note:参数解析器
from loguru import logger
from typing import List, Dict, Any, Union, Tuple, Optional
class ParameterParser(object):
    """Type-checked lookup of known OFD attributes.

    `parameter` maps an attribute key to (accepted_types, default_factory):
    the stored value is returned when it matches accepted_types, otherwise a
    fresh default instance is produced. Unknown keys yield None with a warning.
    """
    parameter = {
        "ofd:FillColor": (dict, dict),
        "ofd:StrokeColor": (dict, dict),
        "ofd:Test": ((str, int), str),
        "ofd:Font": (str, str),
        "@Value": (str, str)
    }

    def __call__(self, key, container):
        spec = ParameterParser.parameter.get(key)
        if spec is None:
            logger.warning(f"{key} not in ParameterParser")
            return None
        expected_types, default_factory = spec
        value = container.get(key, None)
        return value if isinstance(value, expected_types) else default_factory()
import os
from pathlib import Path
import click
from loguru import logger
from typing import List
from fastapi import FastAPI, HTTPException, Request
import magic_pdf.model as model_config
# from magic_pdf.dict2md.ocr_client import PredictClient
from magic_pdf.dict2md.ocr_vllm_client import PredictClient
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, parse_pdf_methods
from argparse import ArgumentParser
from pydantic import BaseModel
import uvicorn
import time
import configparser
#from magic_pdf.tools.config import update_config
# FastAPI application serving the PDF-OCR endpoints defined below.
app = FastAPI()
# Parsing method handed to do_parse ('auto' = choose txt/ocr per document).
method = 'auto'
# Persistent rotating log file for the service.
logger.add("parse.log", rotation="10 MB", level="INFO",
           format="{time} {level} {message}", encoding='utf-8', enqueue=True)
# Module globals populated by ocr_pdf_serve() before uvicorn starts.
config_path = None
ocr_status = None
custom_model = None
class ocrRequest(BaseModel):
    # Request body for POST /pdf_ocr.
    path: str        # absolute path of the PDF file to parse
    output_dir: str  # directory where the parse results are written
class ocrResponse(BaseModel):
    # Response body for POST /pdf_ocr.
    status_code: int  # 200 on success
    output_path: str  # where the parsed output was written
def parse_args():
    """Build the service's CLI parser and return the parsed arguments."""
    arg_parser = ArgumentParser()
    # Which DCU/GPU device the OCR model runs on (exported as CUDA_VISIBLE_DEVICES).
    arg_parser.add_argument('--dcu_id', default='0', help='设置DCU')
    arg_parser.add_argument(
        '--method',
        type=parse_pdf_methods,
        default='auto',
        help="""the method for parsing pdf.
ocr: using ocr technique to extract information from pdf.
txt: suitable for the text-based pdf only and outperform ocr.
auto: automatically choose the best method for parsing pdf from ocr and txt.
without method specified, auto will be used by default.""",
    )
    arg_parser.add_argument(
        '--debug',
        type=bool,
        default=False,
        help='Enables detailed debugging information during the execution of the CLI commands.',
    )
    # Location of the ini file holding server addresses.
    arg_parser.add_argument('--config_path',
                            default='/home/practice/magic_pdf-main/magic_pdf/config.ini')
    return arg_parser.parse_args()
def ocr_pdf_serve(args: str):
    """Initialize the OCR client and model singletons, then start the server.

    Reads server addresses from the ini file at args.config_path and stores
    config_path / ocr_status / custom_model in module globals for the request
    handlers. Blocks in uvicorn.run().
    """
    global config_path, ocr_status, custom_model
    os.environ["CUDA_VISIBLE_DEVICES"] = args.dcu_id
    cfg = configparser.ConfigParser()
    cfg.read(args.config_path)
    # "scheme://host:port" -> (host, port)
    pdf_server = cfg.get('server', 'pdf_server').split('://')[1]
    host, port = pdf_server.split(':')[0], int(pdf_server.split(':')[1])
    config_path = args.config_path
    # Probe the OCR backend once at startup; handlers reuse the status.
    ocr_client = PredictClient(cfg.get('server', 'ocr_server'))
    ocr_status = ocr_client.check_health()
    # Warm up the analysis model (ocr=True, show_log=False) for all requests.
    custom_model = ModelSingleton().get_model(True, False)
    uvicorn.run(app, host=host, port=port)
@app.get("/health")
async def health_check():
    """Liveness probe endpoint; always reports healthy when the server is up."""
    return {"status": "healthy"}
@app.post("/pdf_ocr")
async def pdf_ocr(request: ocrRequest):
    """Parse the PDF at request.path, writing results under request.output_dir.

    Returns {"status_code": 200, "output_path": ...} on success and raises
    HTTP 500 on failure. Relies on the module globals ocr_status, config_path
    and custom_model initialized by ocr_pdf_serve().
    """
    model_config.__use_inside_model__ = True
    model_config.__model_mode__ = 'full'
    output_dir = request.output_dir
    path = request.path
    os.makedirs(output_dir, exist_ok=True)
    debug_able = False
    start_page_id = 0
    end_page_id = None  # None => parse through the last page
    logger.info(f"正在处理文件: {path}")

    def read_fn(path):
        # Read the whole PDF as binary through the project reader/writer.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def parse_doc(doc_path: str, config_path: str):
        # Returns the output path, or None when do_parse raised (logged below).
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            output_path = do_parse(
                ocr_status,
                config_path,
                output_dir,
                file_name,
                pdf_data,
                [],  # no precomputed model list; do_parse runs the model
                method,
                debug_able,
                model=custom_model,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
            )
            return output_path
        except Exception as e:
            logger.exception(e)

    output_path = parse_doc(path, config_path)
    if output_path:
        logger.info(f'文件解析成功:{output_path}')
        return {"status_code": 200, "output_path": output_path}
    else:
        logger.error(f'文件解析失败,文件为:{path}')
        raise HTTPException(status_code=500)
def main():
    """Entry point: parse CLI arguments and launch the serving loop."""
    ocr_pdf_serve(parse_args())


if __name__ == '__main__':
    main()
"""
用户输入:
model数组,每个元素代表一个页面
pdf在s3的路径
截图保存的s3位置
然后:
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
"""
import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse a text-based pdf and tag the result with parse type, version and lang."""
    result = parse_pdf_by_txt(
        pdf_bytes,
        pdf_models,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
    )
    result["_parse_type"] = PARSE_TYPE_TXT
    result["_version_name"] = __version__
    if lang is not None:
        result["_lang"] = lang
    return result
def parse_ocr_pdf(ocr_status, config_path, local_image_dir, pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                  start_page_id=0, end_page_id=None, lang=None,
                  *args, **kwargs):
    """Parse a scanned pdf via OCR and tag the result with parse type, version and lang."""
    result = parse_pdf_by_ocr(
        ocr_status,
        config_path,
        local_image_dir,
        pdf_bytes,
        pdf_models,
        imageWriter,
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=is_debug,
    )
    result["_parse_type"] = PARSE_TYPE_OCR
    result["_version_name"] = __version__
    if lang is not None:
        result["_lang"] = lang
    return result
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
                    input_model_is_empty: bool = False,
                    start_page_id=0, end_page_id=None, lang=None,
                    *args, **kwargs):
    """
    Parse a pdf that mixes text and scanned pages: try the txt route first and
    fall back to the ocr route when txt fails or is flagged for drop.
    """
    def _attempt(parse_method):
        # Run one parser; None signals failure (the exception is logged).
        try:
            return parse_method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                debug_mode=is_debug,
            )
        except Exception as e:
            logger.exception(e)
            return None

    pdf_info_dict = _attempt(parse_pdf_by_txt)
    if pdf_info_dict is not None and not pdf_info_dict.get("_need_drop", False):
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
    else:
        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
        if input_model_is_empty:
            # No precomputed model output was supplied: run the analysis now
            # so the ocr route has layout/formula/table results to work with.
            pdf_models = doc_analyze(
                pdf_bytes,
                ocr=True,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                lang=lang,
                layout_model=kwargs.get("layout_model", None),
                formula_enable=kwargs.get("formula_enable", None),
                table_enable=kwargs.get("table_enable", None),
            )
        pdf_info_dict = _attempt(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
        pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
    pdf_info_dict["_version_name"] = __version__
    if lang is not None:
        pdf_info_dict["_lang"] = lang
    return pdf_info_dict
from loguru import logger
def ImportPIL(f):
    """Decorator: verify Pillow is importable before returning *f* unchanged.

    Exits the whole process with status 1 (after logging) when PIL is missing.
    """
    try:
        import PIL  # noqa: F401
    except ImportError:
        logger.error('Pillow not installed, please install by pip.')
        exit(1)
    return f
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment