"tests/vscode:/vscode.git/clone" did not exist on "64c467e2f56cbec31e38e3067a853abefcf16bbc"
Commit e516cf53 authored by myhloli's avatar myhloli
Browse files

feat(performance): add performance monitoring and optimization

- Add performance_stats module to measure and print execution time statistics
- Implement measure_time decorator to track execution time of key functions
- Remove multi-threading in pdf parsing for better resource management
- Optimize pdf parsing logic for improved performance
parent 6ec440d6
import time
import functools
from collections import defaultdict
from typing import Dict, List
class PerformanceStats:
    """Collect and report execution-time samples grouped by function name.

    All state is stored on the class itself (``_stats``) and every method is
    a classmethod, so all callers in the process share a single registry.
    """

    # Maps a function name to the list of recorded durations (seconds).
    _stats: Dict[str, List[float]] = defaultdict(list)

    @classmethod
    def add_execution_time(cls, func_name: str, execution_time: float):
        """Append one duration sample for ``func_name``.

        Args:
            func_name: Key under which the sample is stored.
            execution_time: Duration of one call, in seconds.
        """
        cls._stats[func_name].append(execution_time)

    @classmethod
    def get_stats(cls) -> Dict[str, dict]:
        """Return aggregate statistics for every function with samples.

        Returns:
            A dict mapping each function name to a dict with the keys
            ``count``, ``total_time``, ``avg_time``, ``min_time`` and
            ``max_time``. Names that have no recorded samples are omitted.
        """
        results = {}
        for func_name, times in cls._stats.items():
            # ``_stats`` is a defaultdict, so a plain read such as
            # ``_stats[name]`` creates an empty list. Skip those entries
            # instead of dividing by zero / calling min() on nothing.
            if not times:
                continue
            total_time = sum(times)  # computed once, reused for the average
            results[func_name] = {
                'count': len(times),
                'total_time': total_time,
                'avg_time': total_time / len(times),
                'min_time': min(times),
                'max_time': max(times),
            }
        return results

    @classmethod
    def print_stats(cls):
        """Print a table of call count, total time and average time to stdout."""
        stats = cls.get_stats()
        print("\n性能统计结果:")
        print("-" * 80)
        print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
        print("-" * 80)
        for func_name, data in stats.items():
            print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
def measure_time(func):
    """Decorator that records how long each call to ``func`` takes.

    The elapsed time of every call that returns normally is passed to
    ``PerformanceStats.add_execution_time`` under ``func.__name__``. If
    ``func`` raises, the exception propagates and no sample is recorded.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature and return value as ``func``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be skewed (or go negative) when the system wall
        # clock is adjusted, unlike time.time().
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        PerformanceStats.add_execution_time(func.__name__, execution_time)
        return result
    return wrapper
\ No newline at end of file
...@@ -21,6 +21,7 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l ...@@ -21,6 +21,7 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
from magic_pdf.libs.convert_utils import dict_to_list from magic_pdf.libs.convert_utils import dict_to_list
from magic_pdf.libs.hash_utils import compute_md5 from magic_pdf.libs.hash_utils import compute_md5
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
from magic_pdf.model.magic_model import MagicModel from magic_pdf.model.magic_model import MagicModel
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
...@@ -217,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float: ...@@ -217,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float:
# logger.info(f"contrast: {contrast}") # logger.info(f"contrast: {contrast}")
return round(contrast, 2) return round(contrast, 2)
@measure_time
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang): def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
# cid用0xfffd表示,连字符拆开 # cid用0xfffd表示,连字符拆开
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks'] # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
...@@ -491,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h): ...@@ -491,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
else: else:
return [[x0, y0, x1, y1]] return [[x0, y0, x1, y1]]
@measure_time
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height): def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = [] page_line_list = []
...@@ -925,7 +926,6 @@ def pdf_parse_union( ...@@ -925,7 +926,6 @@ def pdf_parse_union(
magic_model = MagicModel(model_list, dataset) magic_model = MagicModel(model_list, dataset)
"""根据输入的起始范围解析pdf""" """根据输入的起始范围解析pdf"""
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
end_page_id = ( end_page_id = (
end_page_id end_page_id
if end_page_id is not None and end_page_id >= 0 if end_page_id is not None and end_page_id >= 0
...@@ -939,33 +939,16 @@ def pdf_parse_union( ...@@ -939,33 +939,16 @@ def pdf_parse_union(
"""初始化启动时间""" """初始化启动时间"""
start_time = time.time() start_time = time.time()
# for page_id, page in enumerate(dataset): for page_id, page in enumerate(dataset):
# """debug时输出每页解析的耗时.""" """debug时输出每页解析的耗时."""
# if debug_mode:
# time_now = time.time()
# logger.info(
# f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
# )
# start_time = time_now
#
# """解析pdf中的每一页"""
# if start_page_id <= page_id <= end_page_id:
# page_info = parse_page_core(
# page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
# )
# else:
# page_info = page.get_page_info()
# page_w = page_info.w
# page_h = page_info.h
# page_info = ocr_construct_page_component_v2(
# [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
# )
# pdf_info_dict[f'page_{page_id}'] = page_info
def process_page(page_id, page, dataset_len, start_page_id, end_page_id, magic_model, pdf_bytes_md5, imageWriter,
parse_mode, lang, debug_mode, start_time):
if debug_mode: if debug_mode:
time_now = time.time() time_now = time.time()
logger.info(
f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
)
start_time = time_now
"""解析pdf中的每一页"""
if start_page_id <= page_id <= end_page_id: if start_page_id <= page_id <= end_page_id:
page_info = parse_page_core( page_info = parse_page_core(
page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang page, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode, lang
...@@ -977,44 +960,15 @@ def pdf_parse_union( ...@@ -977,44 +960,15 @@ def pdf_parse_union(
page_info = ocr_construct_page_component_v2( page_info = ocr_construct_page_component_v2(
[], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page' [], [], page_id, page_w, page_h, [], [], [], [], [], True, 'skip page'
) )
return page_id, page_info
# Use max_workers based on CPU count but limit to avoid excessive resource usage
max_workers = 2
pdf_info_dict = {}
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {
executor.submit(
process_page,
page_id,
page,
len(dataset),
start_page_id,
end_page_id,
magic_model,
pdf_bytes_md5,
imageWriter,
parse_mode,
lang,
debug_mode,
time.time()
): page_id
for page_id, page in enumerate(dataset)
}
for page_id in range(len(dataset)):
future = [f for f in futures if futures[f] == page_id][0]
try:
page_id, page_info = future.result()
pdf_info_dict[f'page_{page_id}'] = page_info pdf_info_dict[f'page_{page_id}'] = page_info
except Exception as e:
logger.exception(f"Error processing page {page_id}: {e}")
logger.info( logger.info(
f'page_process_time: {round(time.time() - start_time, 2)}' f'page_process_time: {round(time.time() - start_time, 2)}'
) )
PerformanceStats.print_stats()
"""分段""" """分段"""
para_split(pdf_info_dict) para_split(pdf_info_dict)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment