Commit e4e4eef1 authored by myhloli's avatar myhloli
Browse files

perf(language_detection): optimize batch size for language detection model

- Increase batch size from 8 to 256 for language detection inference
- Add timing measurement for language detection process
parent a5342950
......@@ -4,6 +4,7 @@ from loguru import logger
import re
from io import BytesIO
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
def calculate_sample_count(total_page: int):
......@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
sample_docs = extract_pages(src_pdf_bytes)
sample_pdf_bytes = sample_docs.tobytes()
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
text = extract_text(sample_pdf_file_like_object)
laparams = LAParams(
line_overlap=0.5,
char_margin=2.0,
line_margin=0.5,
word_margin=0.1,
boxes_flow=None,
detect_vertical=False,
all_texts=False,
)
text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
text = text.replace("\n", "")
# logger.info(text)
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
......
# Copyright (c) Opendatalab. All rights reserved.
import time
from collections import Counter
from uuid import uuid4
......@@ -102,9 +103,9 @@ class YOLOv11LangDetModel(object):
temp_images = split_images(image)
for temp_image in temp_images:
all_images.append(resize_images_to_224(temp_image))
images_lang_res = self.batch_predict(all_images, batch_size=8)
# logger.info(f"images_lang_res: {images_lang_res}")
# langdetect_start = time.time()
images_lang_res = self.batch_predict(all_images, batch_size=256)
# logger.info(f"image number of langdetect: {len(images_lang_res)}, langdetect time: {round(time.time() - langdetect_start, 2)}")
if len(images_lang_res) > 0:
count_dict = Counter(images_lang_res)
language = max(count_dict, key=count_dict.get)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment