"git@developer.sourcefind.cn:change/sglang.git" did not exist on "01c99a9959e06205ec58a440a29023878967ecc0"
Unverified Commit a65d6b53 authored by Xiaomeng Zhao's avatar Xiaomeng Zhao Committed by GitHub
Browse files

Merge pull request #1112 from myhloli/dev

refactor(libs): remove unused imports and functions
parents b6931171 2db3c263
"""输入: s3路径,每行一个 输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置."""
import sys
from collections import Counter
import click
import fitz
from loguru import logger
from magic_pdf.config.drop_reason import DropReason
from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
from magic_pdf.libs.commons import get_top_percent_list, mymax
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.pdf_check import detect_invalid_chars
......@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
return res
@click.command()
@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
@click.option('--s3-profile', help='s3上的profile')
def main(s3_pdf_path: str, s3_profile: str):
""""""
try:
file_content = read_file(s3_pdf_path, s3_profile)
pdf_meta_scan(file_content)
except Exception as e:
print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
logger.exception(e)
if __name__ == '__main__':
main()
pass
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
# "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"
......
import datetime
import json
import os, re, configparser
import subprocess
import time
import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config
import fitz # 1.23.9中已经切换到rebase
# import fitz_old as fitz # 使用1.23.9之前的pymupdf库
def get_delta_time(input_time):
return round(time.time() - input_time, 2)
def join_path(*args):
return '/'.join(str(s).rstrip('/') for s in args)
#配置全局的errlog_path,方便demo同步引用
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
json_dump_path = "s3://llm-pdf-text/json_dump/"
# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径,应该在业务代码中定义
def get_top_percent_list(num_list, percent):
"""
获取列表中前百分之多少的元素
......@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
return top_percent_list
def formatted_time(time_stamp):
dt_object = datetime.datetime.fromtimestamp(time_stamp)
output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
return output_time
def mymax(alist: list):
if len(alist) == 0:
return 0 # 空是0, 0*0也是0大小q
else:
return max(alist)
def parse_aws_param(profile):
if isinstance(profile, str):
# 解析配置文件
config_file = join_path(os.path.expanduser("~"), ".aws", "config")
credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
config = configparser.ConfigParser()
config.read(credentials_file)
config.read(config_file)
# 获取 AWS 账户相关信息
ak = config.get(profile, "aws_access_key_id")
sk = config.get(profile, "aws_secret_access_key")
if profile == "default":
s3_str = config.get(f"{profile}", "s3")
else:
s3_str = config.get(f"profile {profile}", "s3")
end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
if end_match:
endpoint = end_match.group(1)
else:
raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
if style_match:
addressing_style = style_match.group(1)
else:
addressing_style = "path"
elif isinstance(profile, dict):
ak = profile["ak"]
sk = profile["sk"]
endpoint = profile["endpoint"]
addressing_style = "auto"
return ak, sk, endpoint, addressing_style
def parse_bucket_key(s3_full_path: str):
"""
......@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
s3_full_path = s3_full_path[1:]
bucket, key = s3_full_path.split("/", 1)
return bucket, key
def read_file(pdf_path: str, s3_profile):
if pdf_path.startswith("s3://"):
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
bucket_name, bucket_key = parse_bucket_key(pdf_path)
res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
file_content = res["Body"].read()
return file_content
else:
with open(pdf_path, "rb") as f:
return f.read()
def get_docx_model_output(pdf_model_output, page_id):
model_output_json = pdf_model_output[page_id]
return model_output_json
def list_dir(dir_path:str, s3_profile:str):
"""
列出dir_path下的所有文件
"""
ret = []
if dir_path.startswith("s3"):
ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
bucket, path = s3info[0][0], s3info[0][1]
try:
cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
config=Config(s3={'addressing_style': addressing_style}))
def list_obj_scluster():
marker = None
while True:
list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
if marker:
list_kwargs['Marker'] = marker
response = cli.list_objects(**list_kwargs)
contents = response.get("Contents", [])
yield from contents
if not response.get("IsTruncated") or len(contents)==0:
break
marker = contents[-1]['Key']
for info in list_obj_scluster():
file_path = info['Key']
#size = info['Size']
if path!="":
afile = file_path[len(path):]
if afile.endswith(".json"):
ret.append(f"s3://{bucket}/{file_path}")
return ret
except Exception as e:
logger.exception(e)
exit(-1)
else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith(".json"):
ret.append(join_path(root, file))
ret.sort()
return ret
def get_img_s3_client(save_path:str, image_s3_config:str):
"""
"""
if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client
ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
img_s3_client = boto3.client(
service_name="s3",
aws_access_key_id=ak,
aws_secret_access_key=sk,
endpoint_url=end_point,
config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
)
else:
img_s3_client = None
return img_s3_client
if __name__=="__main__":
s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
s3_profile = "langchao"
ret = list_dir(s3_path, s3_profile)
print(ret)
\ No newline at end of file
import fitz
from magic_pdf.config.constants import CROSS_PAGE
from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
ContentType)
from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.commons import fitz # PyMuPDF
from magic_pdf.model.magic_model import MagicModel
......
from io import BytesIO
import cv2
import fitz
import numpy as np
from PIL import Image
from magic_pdf.data.data_reader_writer import DataWriter
from magic_pdf.libs.commons import fitz, join_path
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.hash_utils import compute_sha256
......
import enum
import json
from magic_pdf.config.model_block_type import ModelBlockTypeEnum
from magic_pdf.config.ocr_content_type import CategoryId, ContentType
from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
FileBasedDataWriter)
from magic_pdf.data.dataset import Dataset
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
bbox_relative_pos, box_area, calculate_iou,
calculate_overlap_area_in_bbox1_area_ratio,
get_overlap_area)
from magic_pdf.libs.commons import fitz, join_path
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.local_math import float_gt
from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
......@@ -1048,29 +1044,3 @@ class MagicModel:
def get_model_list(self, page_no):
return self.__model_list[page_no]
if __name__ == '__main__':
drw = FileBasedDataReader(r'D:/project/20231108code-clean')
if 0:
pdf_file_path = r'linshixuqiu\19983-00.pdf'
model_file_path = r'linshixuqiu\19983-00_new.json'
pdf_bytes = drw.read(pdf_file_path)
model_json_txt = drw.read(model_file_path).decode()
model_list = json.loads(model_json_txt)
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
img_bucket_path = 'imgs'
img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
pdf_docs = fitz.open('pdf', pdf_bytes)
magic_model = MagicModel(model_list, pdf_docs)
if 1:
from magic_pdf.data.dataset import PymuDocDataset
model_list = json.loads(
drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
)
pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
for i in range(7):
print(magic_model.get_imgs(i))
......@@ -5,6 +5,7 @@ import time
from typing import List
import torch
import fitz
from loguru import logger
from magic_pdf.config.enums import SupportedPdfParseMethod
......@@ -12,7 +13,6 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.data.dataset import Dataset, PageableData
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.commons import fitz, get_delta_time
from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5
......@@ -784,7 +784,7 @@ def pdf_parse_union(
if debug_mode:
time_now = time.time()
logger.info(
f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
f'page_id: {page_id}, last_page_cost_time: {time.time() - start_time}'
)
start_time = time_now
......
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.commons import parse_aws_param, parse_bucket_key, join_path
from magic_pdf.libs.commons import parse_bucket_key, join_path
import boto3
from loguru import logger
from botocore.config import Config
......
......@@ -2,10 +2,10 @@ import io
import json
import os
import fitz
import boto3
from botocore.config import Config
from magic_pdf.libs.commons import fitz
from magic_pdf.libs.config_reader import get_s3_config_dict
from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment