"tests/pipelines/vscode:/vscode.git/clone" did not exist on "013955b5a72d25836c519d6682d8708330351965"
Commit c9171d1f authored by zhougaofeng's avatar zhougaofeng
Browse files

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json,...

Update app.py, count_pdfs.py, LICENSE.md, magic-pdf.template.json, requirements.txt, requirements-docker.txt, requirements-qa.txt, update_version.py, setup.py, magic_pdf/__init__.py, magic_pdf/pdf_parse_by_txt.py, magic_pdf/pdf_parse_by_ocr.py, magic_pdf/user_api.py, magic_pdf/pdf_parse_union_core.py, magic_pdf/__pycache__/pdf_parse_by_ocr.cpython-310.pyc, magic_pdf/__pycache__/__init__.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_by_txt.cpython-310.pyc, magic_pdf/__pycache__/pdf_parse_union_core.cpython-310.pyc, magic_pdf/__pycache__/user_api.cpython-310.pyc, magic_pdf/dict2md/__init__.py, magic_pdf/dict2md/mkcontent.py, magic_pdf/dict2md/ocr_mkcontent.py, magic_pdf/dict2md/ocr_client.py, magic_pdf/dict2md/ocr_server.py, magic_pdf/dict2md/ocr_server_72.py, magic_pdf/dict2md/tmp.py, magic_pdf/dict2md/__pycache__/__init__.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_client.cpython-310.pyc, magic_pdf/dict2md/__pycache__/ocr_mkcontent.cpython-310.pyc, magic_pdf/filter/__init__.py, magic_pdf/filter/pdf_classify_by_type.py, magic_pdf/filter/pdf_meta_scan.py, magic_pdf/integrations/__init__.py, magic_pdf/integrations/rag/__init__.py, magic_pdf/integrations/rag/api.py, magic_pdf/integrations/rag/type.py, magic_pdf/integrations/rag/utils.py, magic_pdf/layout/__init__.py, magic_pdf/layout/bbox_sort.py, magic_pdf/layout/layout_det_utils.py, magic_pdf/layout/layout_sort.py, magic_pdf/layout/layout_spiler_recog.py, magic_pdf/layout/mcol_sort.py, magic_pdf/libs/__init__.py, magic_pdf/libs/boxbase.py, magic_pdf/libs/calc_span_stats.py, magic_pdf/libs/commons.py, magic_pdf/libs/config_reader.py, magic_pdf/libs/Constants.py, magic_pdf/libs/convert_utils.py, magic_pdf/libs/coordinate_transform.py, magic_pdf/libs/detect_language_from_model.py, magic_pdf/libs/draw_bbox.py, magic_pdf/libs/drop_reason.py, magic_pdf/libs/drop_tag.py, magic_pdf/libs/hash_utils.py, magic_pdf/libs/json_compressor.py, magic_pdf/libs/language.py, magic_pdf/libs/local_math.py, magic_pdf/libs/MakeContentConfig.py, magic_pdf/libs/markdown_utils.py, magic_pdf/libs/nlp_utils.py, magic_pdf/libs/ModelBlockTypeEnum.py, magic_pdf/libs/ocr_content_type.py, magic_pdf/libs/path_utils.py, magic_pdf/libs/pdf_check.py, magic_pdf/libs/pdf_image_tools.py, magic_pdf/libs/safe_filename.py, magic_pdf/libs/textbase.py, magic_pdf/libs/version.py, magic_pdf/libs/vis_utils.py, magic_pdf/model/__init__.py, magic_pdf/model/doc_analyze_by_custom_model.py, magic_pdf/model/magic_model.py, magic_pdf/model/pdf_extract_kit.py, magic_pdf/model/model_list.py, magic_pdf/model/pp_structure_v2.py, magic_pdf/model/ppTableModel.py, magic_pdf/model/pek_sub_modules/__init__.py, magic_pdf/model/pek_sub_modules/post_process.py, magic_pdf/model/pek_sub_modules/self_modify.py, magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py, magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py, magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py, magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py, magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py, magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py, magic_pdf/model/pek_sub_modules/structeqtable/__init__.py, magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py, magic_pdf/para/__init__.py, magic_pdf/para/block_continuation_processor.py, magic_pdf/para/block_termination_processor.py, magic_pdf/para/commons.py, magic_pdf/para/denoise.py, magic_pdf/para/draw.py, magic_pdf/para/exceptions.py, magic_pdf/para/layout_match_processor.py, magic_pdf/para/para_pipeline.py, magic_pdf/para/para_split.py, magic_pdf/para/para_split_v2.py, magic_pdf/para/raw_processor.py, magic_pdf/para/stats.py, magic_pdf/para/title_processor.py, magic_pdf/parse/__init__.py, magic_pdf/parse/common_parse.py, magic_pdf/parse/excel_parse.py, magic_pdf/parse/pdf_client.py, magic_pdf/pipe/__init__.py, magic_pdf/pipe/AbsPipe.py, magic_pdf/pipe/OCRPipe.py, magic_pdf/pipe/TXTPipe.py, magic_pdf/pipe/UNIPipe.py, magic_pdf/post_proc/__init__.py, magic_pdf/post_proc/pdf_post_filter.py, magic_pdf/post_proc/remove_footnote.py, magic_pdf/post_proc/detect_para.py, magic_pdf/pre_proc/__init__.py, magic_pdf/pre_proc/citationmarker_remove.py, magic_pdf/pre_proc/construct_page_dict.py, magic_pdf/pre_proc/cut_image.py, magic_pdf/pre_proc/detect_equation.py, magic_pdf/pre_proc/detect_footer_header_by_statistics.py, magic_pdf/pre_proc/detect_footer_by_model.py, magic_pdf/pre_proc/detect_footnote.py, magic_pdf/pre_proc/detect_header.py, magic_pdf/pre_proc/detect_images.py, magic_pdf/pre_proc/detect_page_number.py, magic_pdf/pre_proc/detect_tables.py, magic_pdf/pre_proc/equations_replace.py, magic_pdf/pre_proc/fix_image.py, magic_pdf/pre_proc/fix_table.py, magic_pdf/pre_proc/main_text_font.py, magic_pdf/pre_proc/ocr_detect_all_bboxes.py, magic_pdf/pre_proc/ocr_detect_layout.py, magic_pdf/pre_proc/ocr_dict_merge.py, magic_pdf/pre_proc/ocr_span_list_modify.py, magic_pdf/pre_proc/post_layout_split.py, magic_pdf/pre_proc/remove_bbox_overlap.py, magic_pdf/pre_proc/remove_colored_strip_bbox.py, magic_pdf/pre_proc/pdf_pre_filter.py, magic_pdf/pre_proc/remove_footer_header.py, magic_pdf/pre_proc/remove_rotate_bbox.py, magic_pdf/pre_proc/resolve_bbox_conflict.py, magic_pdf/pre_proc/solve_line_alien.py, magic_pdf/pre_proc/statistics.py, magic_pdf/resources/fasttext-langdetect/lid.176.ftz, magic_pdf/resources/model_config/model_configs.yaml, magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml, magic_pdf/resources/model_config/UniMERNet/demo.yaml, magic_pdf/rw/__init__.py, magic_pdf/rw/AbsReaderWriter.py, magic_pdf/rw/DiskReaderWriter.py, magic_pdf/rw/S3ReaderWriter.py, magic_pdf/spark/__init__.py, magic_pdf/spark/spark_api.py, magic_pdf/tools/__init__.py, magic_pdf/tools/pdf_client.py, magic_pdf/tools/common.py, magic_pdf/tools/cli_dev.py, magic_pdf/tools/cli.py, magic_pdf/tools/pdf_server.py files
parent 748e3b56
Pipeline #1783 canceled with stages
import datetime
import json
import os, re, configparser
import subprocess
import time
import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
import fitz # 1.23.9中已经切换到rebase
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
def get_delta_time(input_time):
return round(time.time() - input_time, 2)
def join_path(*args):
return '/'.join(str(s).rstrip('/') for s in args)
#配置全局的errlog_path,方便demo同步引用
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path = "s3://llm-pdf-text/json_dump/"
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
def get_top_percent_list(num_list, percent):
"""
获取列表中前百分之多少的元素
:param num_list:
:param percent:
:return:
"""
if len(num_list) == 0:
top_percent_list = []
else:
# 对imgs_len_list排序
sorted_imgs_len_list = sorted(num_list, reverse=True)
# 计算 percent 的索引
top_percent_index = int(len(sorted_imgs_len_list) * percent)
# 取前80%的元素
top_percent_list = sorted_imgs_len_list[:top_percent_index]
return top_percent_list
def formatted_time(time_stamp):
dt_object = datetime.datetime.fromtimestamp(time_stamp)
output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
return output_time
def mymax(alist: list):
if len(alist) == 0:
return 0 # 空是0, 0*0也是0大小q
else:
return max(alist)
def parse_aws_param(profile):
if isinstance(profile, str):
# 解析配置文件
config_file = join_path(os.path.expanduser("~"), ".aws", "config")
credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
config = configparser.ConfigParser()
config.read(credentials_file)
config.read(config_file)
# 获取 AWS 账户相关信息
ak = config.get(profile, "aws_access_key_id")
sk = config.get(profile, "aws_secret_access_key")
if profile == "default":
s3_str = config.get(f"{profile}", "s3")
else:
s3_str = config.get(f"profile {profile}", "s3")
end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
if end_match:
endpoint = end_match.group(1)
else:
raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
if style_match:
addressing_style = style_match.group(1)
else:
addressing_style = "path"
elif isinstance(profile, dict):
ak = profile["ak"]
sk = profile["sk"]
endpoint = profile["endpoint"]
addressing_style = "auto"
return ak, sk, endpoint, addressing_style
def parse_bucket_key(s3_full_path: str):
"""
输入 s3://bucket/path/to/my/file.txt
输出 bucket, path/to/my/file.txt
"""
s3_full_path = s3_full_path.strip()
if s3_full_path.startswith("s3://"):
s3_full_path = s3_full_path[5:]
if s3_full_path.startswith("/"):
s3_full_path = s3_full_path[1:]
bucket, key = s3_full_path.split("/", 1)
return bucket, key
def read_file(pdf_path: str, s3_profile):
if pdf_path.startswith("s3://"):
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
bucket_name, bucket_key = parse_bucket_key(pdf_path)
res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
file_content = res["Body"].read()
return file_content
else:
with open(pdf_path, "rb") as f:
return f.read()
def get_docx_model_output(pdf_model_output, page_id):
model_output_json = pdf_model_output[page_id]
return model_output_json
def list_dir(dir_path:str, s3_profile:str):
"""
列出dir_path下的所有文件
"""
ret = []
if dir_path.startswith("s3"):
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
bucket, path = s3info[0][0], s3info[0][1]
try:
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
config=Config(s3={'addressing_style': addressing_style}))
def list_obj_scluster():
marker = None
while True:
list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
if marker:
list_kwargs['Marker'] = marker
response = cli.list_objects(**list_kwargs)
contents = response.get("Contents", [])
yield from contents
if not response.get("IsTruncated") or len(contents)==0:
break
marker = contents[-1]['Key']
for info in list_obj_scluster():
file_path = info['Key']
#size = info['Size']
if path!="":
afile = file_path[len(path):]
if afile.endswith(".json"):
ret.append(f"s3://{bucket}/{file_path}")
return ret
except Exception as e:
logger.exception(e)
exit(-1)
else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith(".json"):
ret.append(join_path(root, file))
ret.sort()
return ret
def get_img_s3_client(save_path:str, image_s3_config:str):
"""
"""
if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
img_s3_client = boto3.client(
service_name="s3",
aws_access_key_id=ak,
aws_secret_access_key=sk,
endpoint_url=end_point,
config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
)
else:
img_s3_client = None
return img_s3_client
if __name__=="__main__":
s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
s3_profile = "langchao"
ret = list_dir(s3_path, s3_profile)
print(ret)
\ No newline at end of file
"""
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组
"""
import json
import os
from loguru import logger
from magic_pdf.libs.commons import parse_bucket_key
# 定义配置文件名常量
CONFIG_FILE_NAME = "magic-pdf.json"
def read_config():
home_dir = os.path.expanduser("~")
config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
if not os.path.exists(config_file):
raise FileNotFoundError(f"{config_file} not found")
with open(config_file, "r", encoding="utf-8") as f:
config = json.load(f)
return config
def get_s3_config(bucket_name: str):
"""
~/magic-pdf.json 读出来
"""
config = read_config()
bucket_info = config.get("bucket_info")
if bucket_name not in bucket_info:
access_key, secret_key, storage_endpoint = bucket_info["[default]"]
else:
access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
if access_key is None or secret_key is None or storage_endpoint is None:
raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
# logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
return access_key, secret_key, storage_endpoint
def get_s3_config_dict(path: str):
access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path))
return {"ak": access_key, "sk": secret_key, "endpoint": storage_endpoint}
def get_bucket_name(path):
bucket, key = parse_bucket_key(path)
return bucket
def get_local_models_dir():
config = read_config()
models_dir = config.get("models-dir")
if models_dir is None:
logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
return "/tmp/models"
else:
return models_dir
def get_device():
config = read_config()
device = config.get("device-mode")
if device is None:
logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
return "cpu"
else:
return device
def get_table_recog_config():
config = read_config()
table_config = config.get("table-config")
if table_config is None:
logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
else:
return table_config
if __name__ == "__main__":
ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
items_list = []
for _, item in input_dict.items():
items_list.append(item)
return items_list
def get_scale_ratio(model_page_info, page):
pix = page.get_pixmap(dpi=72)
pymu_width = int(pix.w)
pymu_height = int(pix.h)
width_from_json = model_page_info['page_info']['width']
height_from_json = model_page_info['page_info']['height']
horizontal_scale_ratio = width_from_json / pymu_width
vertical_scale_ratio = height_from_json / pymu_height
return horizontal_scale_ratio, vertical_scale_ratio
from collections import Counter
from magic_pdf.libs.language import detect_lang
def get_language_from_model(model_list: list):
language_lst = []
for ocr_page_info in model_list:
page_text = ""
layout_dets = ocr_page_info["layout_dets"]
for layout_det in layout_dets:
category_id = layout_det["category_id"]
allow_category_id_list = [15]
if category_id in allow_category_id_list:
page_text += layout_det["text"]
page_language = detect_lang(page_text)
language_lst.append(page_language)
# 统计text_language_list中每种语言的个数
count_dict = Counter(language_lst)
# 输出text_language_list中出现的次数最多的语言
language = max(count_dict, key=count_dict.get)
return language
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.model.magic_model import MagicModel
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
new_rgb = []
for item in rgb_config:
item = float(item) / 255
new_rgb.append(item)
page_data = bbox_list[i]
for bbox in page_data:
x0, y0, x1, y1 = bbox
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
if fill_config:
page.draw_rect(
rect_coords,
color=None,
fill=new_rgb,
fill_opacity=0.3,
width=0.5,
overlay=True,
) # Draw the rectangle
else:
page.draw_rect(
rect_coords,
color=new_rgb,
fill=None,
fill_opacity=1,
width=0.5,
overlay=True,
) # Draw the rectangle
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
new_rgb = []
for item in rgb_config:
item = float(item) / 255
new_rgb.append(item)
page_data = bbox_list[i]
for j, bbox in enumerate(page_data):
x0, y0, x1, y1 = bbox
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
if fill_config:
page.draw_rect(
rect_coords,
color=None,
fill=new_rgb,
fill_opacity=0.3,
width=0.5,
overlay=True,
) # Draw the rectangle
else:
page.draw_rect(
rect_coords,
color=new_rgb,
fill=None,
fill_opacity=1,
width=0.5,
overlay=True,
) # Draw the rectangle
page.insert_text(
(x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
) # Insert the index in the top left corner of the rectangle
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
layout_bbox_list = []
dropped_bbox_list = []
tables_list, tables_body_list = [], []
tables_caption_list, tables_footnote_list = [], []
imgs_list, imgs_body_list, imgs_caption_list = [], [], []
imgs_footnote_list = []
titles_list = []
texts_list = []
interequations_list = []
for page in pdf_info:
page_layout_list = []
page_dropped_list = []
tables, tables_body, tables_caption, tables_footnote = [], [], [], []
imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
titles = []
texts = []
interequations = []
for layout in page['layout_bboxes']:
page_layout_list.append(layout['layout_bbox'])
layout_bbox_list.append(page_layout_list)
for dropped_bbox in page['discarded_blocks']:
page_dropped_list.append(dropped_bbox['bbox'])
dropped_bbox_list.append(page_dropped_list)
for block in page['para_blocks']:
bbox = block['bbox']
if block['type'] == BlockType.Table:
tables.append(bbox)
for nested_block in block['blocks']:
bbox = nested_block['bbox']
if nested_block['type'] == BlockType.TableBody:
tables_body.append(bbox)
elif nested_block['type'] == BlockType.TableCaption:
tables_caption.append(bbox)
elif nested_block['type'] == BlockType.TableFootnote:
tables_footnote.append(bbox)
elif block['type'] == BlockType.Image:
imgs.append(bbox)
for nested_block in block['blocks']:
bbox = nested_block['bbox']
if nested_block['type'] == BlockType.ImageBody:
imgs_body.append(bbox)
elif nested_block['type'] == BlockType.ImageCaption:
imgs_caption.append(bbox)
elif nested_block['type'] == BlockType.ImageFootnote:
imgs_footnote.append(bbox)
elif block['type'] == BlockType.Title:
titles.append(bbox)
elif block['type'] == BlockType.Text:
texts.append(bbox)
elif block['type'] == BlockType.InterlineEquation:
interequations.append(bbox)
tables_list.append(tables)
tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption)
tables_footnote_list.append(tables_footnote)
imgs_list.append(imgs)
imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption)
imgs_footnote_list.append(imgs_footnote)
titles_list.append(titles)
texts_list.append(texts)
interequations_list.append(interequations)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158],
True)
draw_bbox_without_number(i, tables_list, page, [153, 153, 0],
True) # color !
draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0],
True)
draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102],
True)
draw_bbox_without_number(i, tables_footnote_list, page,
[229, 255, 204], True)
draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True),
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
True)
# Save the PDF
pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
text_list = []
inline_equation_list = []
interline_equation_list = []
image_list = []
table_list = []
dropped_list = []
next_page_text_list = []
next_page_inline_equation_list = []
def get_span_info(span):
if span['type'] == ContentType.Text:
if span.get(CROSS_PAGE, False):
next_page_text_list.append(span['bbox'])
else:
page_text_list.append(span['bbox'])
elif span['type'] == ContentType.InlineEquation:
if span.get(CROSS_PAGE, False):
next_page_inline_equation_list.append(span['bbox'])
else:
page_inline_equation_list.append(span['bbox'])
elif span['type'] == ContentType.InterlineEquation:
page_interline_equation_list.append(span['bbox'])
elif span['type'] == ContentType.Image:
page_image_list.append(span['bbox'])
elif span['type'] == ContentType.Table:
page_table_list.append(span['bbox'])
for page in pdf_info:
page_text_list = []
page_inline_equation_list = []
page_interline_equation_list = []
page_image_list = []
page_table_list = []
page_dropped_list = []
# 将跨页的span放到移动到下一页的列表中
if len(next_page_text_list) > 0:
page_text_list.extend(next_page_text_list)
next_page_text_list.clear()
if len(next_page_inline_equation_list) > 0:
page_inline_equation_list.extend(next_page_inline_equation_list)
next_page_inline_equation_list.clear()
# 构造dropped_list
for block in page['discarded_blocks']:
if block['type'] == BlockType.Discarded:
for line in block['lines']:
for span in line['spans']:
page_dropped_list.append(span['bbox'])
dropped_list.append(page_dropped_list)
# 构造其余useful_list
for block in page['para_blocks']:
if block['type'] in [
BlockType.Text,
BlockType.Title,
BlockType.InterlineEquation,
]:
for line in block['lines']:
for span in line['spans']:
get_span_info(span)
elif block['type'] in [BlockType.Image, BlockType.Table]:
for sub_block in block['blocks']:
for line in sub_block['lines']:
for span in line['spans']:
get_span_info(span)
text_list.append(page_text_list)
inline_equation_list.append(page_inline_equation_list)
interline_equation_list.append(page_interline_equation_list)
image_list.append(page_image_list)
table_list.append(page_table_list)
pdf_docs = fitz.open('pdf', pdf_bytes)
for i, page in enumerate(pdf_docs):
# 获取当前页面的数据
draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0],
False)
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255],
False)
draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
# Save the PDF
pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
dropped_bbox_list = []
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
titles_list = []
texts_list = []
interequations_list = []
pdf_docs = fitz.open('pdf', pdf_bytes)
magic_model = MagicModel(model_list, pdf_docs)
for i in range(len(model_list)):
page_dropped_list = []
tables_body, tables_caption, tables_footnote = [], [], []
imgs_body, imgs_caption, imgs_footnote = [], [], []
titles = []
texts = []
interequations = []
page_info = magic_model.get_model_list(i)
layout_dets = page_info['layout_dets']
for layout_det in layout_dets:
bbox = layout_det['bbox']
if layout_det['category_id'] == CategoryId.Text:
texts.append(bbox)
elif layout_det['category_id'] == CategoryId.Title:
titles.append(bbox)
elif layout_det['category_id'] == CategoryId.TableBody:
tables_body.append(bbox)
elif layout_det['category_id'] == CategoryId.TableCaption:
tables_caption.append(bbox)
elif layout_det['category_id'] == CategoryId.TableFootnote:
tables_footnote.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageBody:
imgs_body.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageCaption:
imgs_caption.append(bbox)
elif layout_det[
'category_id'] == CategoryId.InterlineEquation_YOLO:
interequations.append(bbox)
elif layout_det['category_id'] == CategoryId.Abandon:
page_dropped_list.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageFootnote:
imgs_footnote.append(bbox)
tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption)
tables_footnote_list.append(tables_footnote)
imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption)
titles_list.append(titles)
texts_list.append(texts)
interequations_list.append(interequations)
dropped_bbox_list.append(page_dropped_list)
imgs_footnote_list.append(imgs_footnote)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
True) # color !
draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102],
True)
draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204],
True)
draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True)
draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
# Save the PDF
pdf_docs.save(f'{out_path}/{filename}_model.pdf')
class DropReason:
TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs" # 含特殊图片,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs" # 特殊的SVG图,计算量太大,从而丢弃
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages" # 计算量超过负荷,当前方法下计算量消耗过大
MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result" # 版面分析失败
Exception = "_exception" # 解析中发生异常
ENCRYPTED = "encrypted" # PDF是加密的
EMPTY_PDF = "total_page=0" # PDF页面总数为0
NOT_IS_TEXT_PDF = "not_is_text_pdf" # 不是文字版PDF,无法直接解析
DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block" # 无法清晰的分段
TITLE_DETECTION_FAILED = "title_detection_failed" # 探测标题失败
TITLE_LEVEL_FAILED = "title_level_failed" # 分析标题级别失败(例如一级、二级、三级标题)
PARA_SPLIT_FAILED = "para_split_failed" # 识别段落失败
PARA_MERGE_FAILED = "para_merge_failed" # 段落合并失败
NOT_ALLOW_LANGUAGE = "not_allow_language" # 不支持的语种
SPECIAL_PDF = "special_pdf"
PSEUDO_SINGLE_COLUMN = "pseudo_single_column" # 无法精确判断文字分栏
CAN_NOT_DETECT_PAGE_LAYOUT="can_not_detect_page_layout" # 无法分析页面的版面
NEGATIVE_BBOX_AREA = "negative_bbox_area" # 缩放导致 bbox 面积为负
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation" # 无法分离重叠的block
\ No newline at end of file
COLOR_BG_HEADER_TXT_BLOCK = "color_background_header_txt_block"
PAGE_NO = "page-no" # 页码
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
VERTICAL_TEXT = 'vertical-text' # 垂直文本
ROTATE_TEXT = 'rotate-text' # 旋转文本
EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
class DropTag:
PAGE_NUMBER = "page_no"
HEADER = "header"
FOOTER = "footer"
FOOTNOTE = "footnote"
NOT_IN_LAYOUT = "not_in_layout"
SPAN_OVERLAP = "span_overlap"
BLOCK_OVERLAP = "block_overlap"
import hashlib
def compute_md5(file_bytes):
hasher = hashlib.md5()
hasher.update(file_bytes)
return hasher.hexdigest().upper()
def compute_sha256(input_string):
hasher = hashlib.sha256()
# 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理
input_bytes = input_string.encode('utf-8')
hasher.update(input_bytes)
return hasher.hexdigest()
import json
import brotli
import base64
class JsonCompressor:
@staticmethod
def compress_json(data):
"""
Compress a json object and encode it with base64
"""
json_str = json.dumps(data)
json_bytes = json_str.encode('utf-8')
compressed = brotli.compress(json_bytes, quality=6)
compressed_str = base64.b64encode(compressed).decode('utf-8') # convert bytes to string
return compressed_str
@staticmethod
def decompress_json(compressed_str):
"""
Decode the base64 string and decompress the json object
"""
compressed = base64.b64decode(compressed_str.encode('utf-8')) # convert string to bytes
decompressed_bytes = brotli.decompress(compressed)
json_str = decompressed_bytes.decode('utf-8')
data = json.loads(json_str)
return data
import os
import unicodedata
if not os.getenv("FTLANG_CACHE"):
current_file_path = os.path.abspath(__file__)
current_dir = os.path.dirname(current_file_path)
root_dir = os.path.dirname(current_dir)
ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
# print(os.getenv("FTLANG_CACHE"))
from fast_langdetect import detect_language
def detect_lang(text: str) -> str:
if len(text) == 0:
return ""
try:
lang_upper = detect_language(text)
except:
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_language(html_no_ctrl_chars)
try:
lang = lang_upper.lower()
except:
lang = ""
return lang
if __name__ == '__main__':
print(os.getenv("FTLANG_CACHE"))
print(detect_lang("This is a test."))
print(detect_lang("<html>This is a test</html>"))
print(detect_lang("这个是中文测试。"))
print(detect_lang("<html>这个是中文测试。</html>"))
def float_gt(a, b):
if 0.0001 >= abs(a -b):
return False
return a > b
def float_equal(a, b):
if 0.0001 >= abs(a-b):
return True
return False
\ No newline at end of file
import re
def escape_special_markdown_char(pymu_blocks):
"""
转义正文里对markdown语法有特殊意义的字符
"""
special_chars = ["*", "`", "~", "$"]
for blk in pymu_blocks:
for line in blk['lines']:
for span in line['spans']:
for char in special_chars:
span_text = span['text']
span_type = span.get("_type", None)
if span_type in ['inline-equation', 'interline-equation']:
continue
elif span_text:
span['text'] = span['text'].replace(char, "\\" + char)
return pymu_blocks
def ocr_escape_special_markdown_char(content):
"""
转义正文里对markdown语法有特殊意义的字符
"""
special_chars = ["*", "`", "~", "$"]
for char in special_chars:
content = content.replace(char, "\\" + char)
return content
import re
from os import path
from collections import Counter
from loguru import logger
# from langdetect import detect
import spacy
import en_core_web_sm
import zh_core_web_sm
from magic_pdf.libs.language import detect_lang
class NLPModels:
"""
How to upload local models to s3:
- config aws cli:
doc\SETUP-CLI.md
doc\setup_cli.sh
app\config\__init__.py
- $ cd {local_dir_storing_models}
- $ ls models
en_core_web_sm-3.7.1/
zh_core_web_sm-3.7.0/
- $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
- $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
PRE en_core_web_sm-3.7.1/
PRE zh_core_web_sm-3.7.0/
"""
def __init__(self):
# if OS is windows, set "TMP_DIR" to "D:/tmp"
home_dir = path.expanduser("~")
self.default_local_path = path.join(home_dir, ".nlp_models")
self.default_shared_path = "/share/pdf_processor/nlp_models"
self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
self.default_s3_path = "s3://llm-infra/models"
self.nlp_models = self.nlp_models = {
"en_core_web_sm": {
"type": "spacy",
"version": "3.7.1",
},
"en_core_web_md": {
"type": "spacy",
"version": "3.7.1",
},
"en_core_web_lg": {
"type": "spacy",
"version": "3.7.1",
},
"zh_core_web_sm": {
"type": "spacy",
"version": "3.7.0",
},
"zh_core_web_md": {
"type": "spacy",
"version": "3.7.0",
},
"zh_core_web_lg": {
"type": "spacy",
"version": "3.7.0",
},
}
self.en_core_web_sm_model = en_core_web_sm.load()
self.zh_core_web_sm_model = zh_core_web_sm.load()
def load_model(self, model_name, model_type, model_version):
if (
model_name in self.nlp_models
and self.nlp_models[model_name]["type"] == model_type
and self.nlp_models[model_name]["version"] == model_version
):
return spacy.load(model_name) if spacy.util.is_package(model_name) else None
else:
logger.error(f"Unsupported model name or version: {model_name} {model_version}")
return None
def detect_language(self, text, use_langdetect=False):
if len(text) == 0:
return None
if use_langdetect:
# print("use_langdetect")
# print(detect_lang(text))
# return detect_lang(text)
if detect_lang(text) == "zh":
return "zh"
else:
return "en"
if not use_langdetect:
en_count = len(re.findall(r"[a-zA-Z]", text))
cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
if en_count > cn_count:
return "en"
if cn_count > en_count:
return "zh"
def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
"""
Detect entity categories using NLP models and return the most frequent entity types.
Parameters
----------
text : str
Text to be processed.
Returns
-------
str
The most frequent entity type.
"""
lang = self.detect_language(text, use_langdetect=True)
if lang == "en":
nlp_model = self.en_core_web_sm_model
elif lang == "zh":
nlp_model = self.zh_core_web_sm_model
else:
# logger.error(f"Unsupported language: {lang}")
return {}
# Splitting text into smaller parts
text_parts = re.split(r"[,;,;、\s & |]+", text)
text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)] # Remove non-words
text_combined = " ".join(text_parts)
try:
doc = nlp_model(text_combined)
entity_counts = Counter([ent.label_ for ent in doc.ents])
word_counts_in_entities = Counter()
for ent in doc.ents:
word_counts_in_entities[ent.label_] += len(ent.text.split())
total_words_in_entities = sum(word_counts_in_entities.values())
total_words = len([token for token in doc if not token.is_punct])
if total_words_in_entities == 0 or total_words == 0:
return None
entity_percentage = total_words_in_entities / total_words
if entity_percentage < 0.5:
return None
most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
entity_percentage = word_count / total_words_in_entities
if entity_percentage >= threshold:
return most_common_entity
else:
return None
except Exception as e:
logger.error(f"Error in entity detection: {e}")
return None
def __main__():
nlpModel = NLPModels()
test_strings = [
"张三",
"张三, 李四,王五; 赵六",
"John Doe",
"Jane Smith",
"Lee, John",
"John Doe, Jane Smith; Alice Johnson,Bob Lee",
"孙七, Michael Jordan;赵八",
"David Smith Michael O'Connor; Kevin ßáçøñ",
"李雷·韩梅梅, 张三·李四",
"Charles Robert Darwin, Isaac Newton",
"莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
"John Doe, Jane Smith; Alice Johnson",
"张三, 李四,王五; 赵六",
"Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
"Rachel Mills & William Barry & Susanne B. Haga",
"Claire Chabut* and Jean-François Bussières",
"1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
"Changchun",
"china",
"Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
"Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
"Synergistic Effect of Supported Nickel Catalyst with",
"Intumescent Flame-Retardants on Flame Retardancy",
"and Thermal Stability of Polypropylene",
]
for test in test_strings:
print()
print(f"Original String: {test}")
result = nlpModel.detect_entity_catgr_using_nlp(test)
print(f"Detected entities: {result}")
if __name__ == "__main__":
__main__()
class ContentType:
Image = 'image'
Table = 'table'
Text = 'text'
InlineEquation = 'inline_equation'
InterlineEquation = 'interline_equation'
class BlockType:
Image = 'image'
ImageBody = 'image_body'
ImageCaption = 'image_caption'
ImageFootnote = 'image_footnote'
Table = 'table'
TableBody = 'table_body'
TableCaption = 'table_caption'
TableFootnote = 'table_footnote'
Text = 'text'
Title = 'title'
InterlineEquation = 'interline_equation'
Footnote = 'footnote'
Discarded = 'discarded'
class CategoryId:
Title = 0
Text = 1
Abandon = 2
ImageBody = 3
ImageCaption = 4
TableBody = 5
TableCaption = 6
TableFootnote = 7
InterlineEquation_Layout = 8
InlineEquation = 13
InterlineEquation_YOLO = 14
OcrText = 15
ImageFootnote = 101
def remove_non_official_s3_args(s3path):
"""
example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
"""
arr = s3path.split("?")
return arr[0]
def parse_s3path(s3path: str):
# from s3pathlib import S3Path
# p = S3Path(remove_non_official_s3_args(s3path))
# return p.bucket, p.key
s3path = remove_non_official_s3_args(s3path).strip()
if s3path.startswith(('s3://', 's3a://')):
prefix, path = s3path.split('://', 1)
bucket_name, key = path.split('/', 1)
return bucket_name, key
elif s3path.startswith('/'):
raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
else:
raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
def parse_s3_range_params(s3path: str):
"""
example: s3://abc/xxxx.json?bytes=0,81350 ==> [0, 81350]
"""
arr = s3path.split("?bytes=")
if len(arr) == 1:
return None
return arr[1].split(",")
from io import BytesIO
import re
import fitz
import numpy as np
from loguru import logger
from pdfminer.high_level import extract_text
def calculate_sample_count(total_page: int):
"""
根据总页数和采样率计算采样页面的数量。
"""
select_page_cnt = min(10, total_page)
return select_page_cnt
def extract_pages(src_pdf_bytes: bytes):
pdf_docs = fitz.open("pdf", src_pdf_bytes)
total_page = len(pdf_docs)
if total_page == 0:
# 如果PDF没有页面,直接返回空文档
logger.warning("PDF is empty, return empty document")
return fitz.Document()
select_page_cnt = calculate_sample_count(total_page)
page_num = np.random.choice(total_page, select_page_cnt, replace=False)
sample_docs = fitz.Document()
try:
for index in page_num:
sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
except Exception as e:
logger.exception(e)
return sample_docs
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
""""
检测PDF中是否包含非法字符
"""
'''pdfminer比较慢,需要先随机抽取10页左右的sample'''
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
text = extract_text(sample_pdf_file_like_object)
text = text.replace("\n", "")
# logger.info(text)
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
cid_pattern = re.compile(r'\(cid:\d+\)')
matches = cid_pattern.findall(text)
cid_count = len(matches)
cid_len = sum(len(match) for match in matches)
text_len = len(text)
if text_len == 0:
cid_chars_radio = 0
else:
cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
'''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
if cid_chars_radio > 0.05:
return False # 乱码文档
else:
return True # 正常文档
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
"""
从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径
save_path:需要同时支持s3和本地, 图片存放在save_path下,文件名是: {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。
"""
# 拼接文件名
filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
# 老版本返回不带bucket的路径
img_path = join_path(return_path, filename) if return_path is not None else None
# 新版本生成平铺路径
img_hash256_path = f"{compute_sha256(img_path)}.jpg"
# 将坐标转换为fitz.Rect对象
rect = fitz.Rect(*bbox)
# 配置缩放倍数为3倍
zoom = fitz.Matrix(3, 3)
# 截取图片
pix = page.get_pixmap(clip=rect, matrix=zoom)
byte_data = pix.tobytes(output='jpeg', jpg_quality=95)
imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN)
return img_hash256_path
import os
def sanitize_filename(filename, replacement="_"):
if os.name == 'nt':
invalid_chars = '<>:"|?*'
for char in invalid_chars:
filename = filename.replace(char, replacement)
return filename
import math
def __inc_dict_val(mp, key, val_inc:int):
if mp.get(key):
mp[key] = mp[key] + val_inc
else:
mp[key] = val_inc
def get_text_block_base_info(block):
"""
获取这个文本块里的字体的颜色、字号、字体
按照正文字数最多的返回
"""
counter = {}
for line in block['lines']:
for span in line['spans']:
color = span['color']
size = round(span['size'], 2)
font = span['font']
txt_len = len(span['text'])
__inc_dict_val(counter, (color, size, font), txt_len)
c, s, ft = max(counter, key=counter.get)
return c, s, ft
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment