Commit 2db3c263 authored by myhloli

refactor(libs): remove unused imports and functions

- Remove unused imports from commons.py
- Delete unused functions related to AWS and S3 operations
- Update import statements in other modules to reflect changes in commons.py
- Remove redundant code and improve code readability
parent e937e011
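As the commit message notes, modules that previously pulled PyMuPDF (and now-deleted helpers such as read_file) out of magic_pdf.libs.commons switch to importing fitz directly; a minimal before/after sketch of the import change applied throughout the hunks below:

# before: PyMuPDF and helpers re-exported by the commons module
from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file

# after: PyMuPDF imported directly, only the kept helpers come from commons
import fitz
from magic_pdf.libs.commons import get_top_percent_list, mymax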
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置.""" """输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
import sys
from collections import Counter from collections import Counter
import click import fitz
from loguru import logger from loguru import logger
from magic_pdf.config.drop_reason import DropReason from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars from magic_pdf.libs.pdf_check import detect_invalid_chars
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
    return res
@click.command()
@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
@click.option('--s3-profile', help='s3上的profile')
def main(s3_pdf_path: str, s3_profile: str):
""""""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(file_content)
except Exception as e:
print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
logger.exception(e)
if __name__ == '__main__':
    main()
    pass
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
......
import datetime
import json
import os, re, configparser
import subprocess
import time
import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
import fitz  # switched to the rebased implementation as of 1.23.9
# import fitz_old as fitz  # use the pymupdf library from before 1.23.9
def get_delta_time(input_time):
return round(time.time() - input_time, 2)
def join_path(*args):
    return '/'.join(str(s).rstrip('/') for s in args)
# configure the global errlog_path here so the demo can reference it consistently
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/"  # this path is only for temporary local testing and must not be committed to main
json_dump_path = "s3://llm-pdf-text/json_dump/"
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # the base library should not hard-code paths like this; they belong in business code
def get_top_percent_list(num_list, percent):
    """
    Get the top `percent` fraction of elements from the list.
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
    return top_percent_list
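The body of get_top_percent_list is collapsed in this hunk; going by its docstring alone, a hypothetical stand-in might look like the sketch below (the descending sort and the truncation rule are assumptions, not necessarily what commons.py actually does):

def get_top_percent_list_sketch(num_list, percent):
    # assumed behavior: keep the largest `percent` fraction of the values
    top_n = int(len(num_list) * percent)
    return sorted(num_list, reverse=True)[:top_n]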
def formatted_time(time_stamp):
dt_object = datetime.datetime.fromtimestamp(time_stamp)
output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
return output_time
def mymax(alist: list):
    if len(alist) == 0:
        return 0  # empty counts as 0; a 0*0 size is also 0
    else:
        return max(alist)
def parse_aws_param(profile):
if isinstance(profile, str):
# parse the config files
config_file = join_path(os.path.expanduser("~"), ".aws", "config")
credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
config = configparser.ConfigParser()
config.read(credentials_file)
config.read(config_file)
# get the AWS account information
ak = config.get(profile, "aws_access_key_id")
sk = config.get(profile, "aws_secret_access_key")
if profile == "default":
s3_str = config.get(f"{profile}", "s3")
else:
s3_str = config.get(f"profile {profile}", "s3")
end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
if end_match:
endpoint = end_match.group(1)
else:
raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
if style_match:
addressing_style = style_match.group(1)
else:
addressing_style = "path"
elif isinstance(profile, dict):
ak = profile["ak"]
sk = profile["sk"]
endpoint = profile["endpoint"]
addressing_style = "auto"
return ak, sk, endpoint, addressing_style
def parse_bucket_key(s3_full_path: str):
    """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
        s3_full_path = s3_full_path[1:]
    bucket, key = s3_full_path.split("/", 1)
    return bucket, key
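parse_bucket_key is one of the helpers that stays in commons; judging from the visible tail, and assuming the collapsed lines strip the s3:// scheme prefix (callers such as read_file pass full s3:// URIs), it splits a path into bucket and key. A hypothetical call, with an illustrative URI:

bucket, key = parse_bucket_key("s3://llm-pdf-text/err_logs/sample.jsonl")
# bucket == "llm-pdf-text", key == "err_logs/sample.jsonl"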
def read_file(pdf_path: str, s3_profile):
if pdf_path.startswith("s3://"):
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
bucket_name, bucket_key = parse_bucket_key(pdf_path)
res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
file_content = res["Body"].read()
return file_content
else:
with open(pdf_path, "rb") as f:
return f.read()
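With read_file gone from commons, a caller that still needs the same behavior can read the bytes itself; a minimal sketch mirroring the deleted helper, where the pre-built s3_client is an assumption about how credentials are now supplied:

from magic_pdf.libs.commons import parse_bucket_key

def read_pdf_bytes(path: str, s3_client=None) -> bytes:
    # s3:// URIs go through boto3; anything else is treated as a local file
    if path.startswith("s3://"):
        bucket, key = parse_bucket_key(path)
        return s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()
    with open(path, "rb") as f:
        return f.read()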
def get_docx_model_output(pdf_model_output, page_id):
model_output_json = pdf_model_output[page_id]
return model_output_json
def list_dir(dir_path:str, s3_profile:str):
"""
List all files under dir_path
"""
ret = []
if dir_path.startswith("s3"):
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
bucket, path = s3info[0][0], s3info[0][1]
try:
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
config=Config(s3={'addressing_style': addressing_style}))
def list_obj_scluster():
marker = None
while True:
list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
if marker:
list_kwargs['Marker'] = marker
response = cli.list_objects(**list_kwargs)
contents = response.get("Contents", [])
yield from contents
if not response.get("IsTruncated") or len(contents)==0:
break
marker = contents[-1]['Key']
for info in list_obj_scluster():
file_path = info['Key']
#size = info['Size']
if path!="":
afile = file_path[len(path):]
if afile.endswith(".json"):
ret.append(f"s3://{bucket}/{file_path}")
return ret
except Exception as e:
logger.exception(e)
exit(-1)
else:  # local directory: scan it and return all the jsonl files inside
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith(".json"):
ret.append(join_path(root, file))
ret.sort()
return ret
def get_img_s3_client(save_path:str, image_s3_config:str):
"""
"""
if save_path.startswith("s3://"):  # placed here so that an s3 client is only created when actually needed
ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
img_s3_client = boto3.client(
service_name="s3",
aws_access_key_id=ak,
aws_secret_access_key=sk,
endpoint_url=end_point,
config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
)
else:
img_s3_client = None
return img_s3_client
if __name__=="__main__":
s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
s3_profile = "langchao"
ret = list_dir(s3_path, s3_profile)
print(ret)
\ No newline at end of file
import fitz
from magic_pdf.config.constants import CROSS_PAGE
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
                                                ContentType)
from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.model.magic_model import MagicModel
......
from io import BytesIO
import cv2
import fitz
import numpy as np
from PIL import Image
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.commons import fitz, join_path
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
......
import enum
import json
from magic_pdf.config.model_block_type import ModelBlockTypeEnum
from magic_pdf.config.ocr_content_type import CategoryId, ContentType
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                               FileBasedDataWriter)
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                    bbox_relative_pos, box_area, calculate_iou,
                                    calculate_overlap_area_in_bbox1_area_ratio,
                                    get_overlap_area)
from magic_pdf.libs.commons import fitz, join_path
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.local_math import float_gt
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
    def get_model_list(self, page_no):
        return self.__model_list[page_no]
if __name__ == '__main__':
drw = FileBasedDataReader(r'D:/project/20231108code-clean')
if 0:
pdf_file_path = r'linshixuqiu\19983-00.pdf'
model_file_path = r'linshixuqiu\19983-00_new.json'
pdf_bytes = drw.read(pdf_file_path)
model_json_txt = drw.read(model_file_path).decode()
model_list = json.loads(model_json_txt)
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
img_bucket_path = 'imgs'
img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
pdf_docs = fitz.open('pdf', pdf_bytes)
magic_model = MagicModel(model_list, pdf_docs)
if 1:
from magic_pdf.data.dataset import PymuDocDataset
model_list = json.loads(
drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
)
pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
for i in range(7):
print(magic_model.get_imgs(i))
@@ -5,6 +5,7 @@ import time
from typing import List
import torch
import fitz
from loguru import logger
from magic_pdf.config.enums import SupportedPdfParseMethod
@@ -12,7 +13,6 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
@@ -784,7 +784,7 @@ def pdf_parse_union(
        if debug_mode:
            time_now = time.time()
            logger.info(
                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
                f'page_id: {page_id}, last_page_cost_time: {time.time() - start_time}'
            )
            start_time = time_now
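Note that the removed get_delta_time rounded the elapsed time to two decimals, whereas the inlined time.time() - start_time logs the raw float; if the old log format matters, the call could round explicitly, for example:

logger.info(
    f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
)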
......
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
from magic_pdf.libs.commons import parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
......
@@ -2,10 +2,10 @@ import io
import json
import os
import fitz
import boto3
from botocore.config import Config
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
......