# infer.py
# --- Earlier single-shot attempt, kept for reference ---
# import librosa
# from espnet_onnx import Speech2Text
# import numpy as np

# PROVIDERS = ['CPUExecutionProvider']
# tag_name = 'transformer_lm'

# speech2text = Speech2Text(
#     providers=PROVIDERS,
#     model_dir="/home/sunzhq/workspace/yidong-infer/conformer/onnx_models/transformer_lm"
# )

# # Load the audio
# audio_path = '/data/datasets/1/data_aishell/wav/test/S0916/BAC009S0916W0314.wav'
# y, sr = librosa.load(audio_path, sr=16000)

# print(f"Original audio length: {len(y)} samples, {len(y)/sr:.2f} s")

# # Per the error message, index 778 was out of range, i.e. about 512 frames;
# # assuming a 10 ms frame shift, 512 frames correspond to 5.12 s of audio.
# max_seconds = 4  # adjust based on your own tests
# max_samples = int(max_seconds * sr)

# # Trim the audio if it is too long
# if len(y) > max_samples:
#     print(f"Audio too long, trimming to the first {max_seconds} s ({max_samples} samples)")
#     y = y[:max_samples]

# # Now try inference
# try:
#     nbest = speech2text(y)
#     # import pdb;pdb.set_trace()
#     print(f"Transcription: {nbest[0][0]}")
# except Exception as e:
#     print(f"Error: {e}")


import librosa
from espnet_onnx import Speech2Text
import numpy as np

def process_long_audio(audio_path, model_dir, max_chunk_seconds=10, sr=16000):
    """Transcribe a long audio file, falling back to chunked processing."""
    # Load the model
    speech2text = Speech2Text(
        providers=['ROCMExecutionProvider'],
        model_dir=model_dir
    )

    # Load the audio
    y, sr = librosa.load(audio_path, sr=sr)
    print(f"Total audio length: {len(y)/sr:.2f}s ({len(y)} samples)")

    # Try the whole utterance in one pass first
    try:
        nbest = speech2text(y)
        print(f"Whole-utterance transcription succeeded: {nbest[0][0]}")
        return nbest[0][0]
    except Exception as e:
        print(f"Whole-utterance processing failed, falling back to chunks: {e}")

    # Determine the longest input length the model accepts
    max_samples = find_max_safe_length(speech2text, sr, max_chunk_seconds)

    # Chunked processing
    results = []
    overlap = int(1 * sr)  # 1 s of overlap between consecutive chunks

    step = max(max_samples - overlap, int(0.5 * sr))  # guard against a non-positive stride
    for start in range(0, len(y), step):
        end = min(start + max_samples, len(y))
        chunk = y[start:end]

        # Skip chunks shorter than 0.5 s
        if len(chunk) < 0.5 * sr:
            continue

        duration = len(chunk) / sr
        print(f"Processing {start/sr:.1f}s-{end/sr:.1f}s ({duration:.1f}s)...")

        try:
            nbest = speech2text(chunk)
            if nbest and nbest[0]:
                results.append(nbest[0][0])
        except Exception as e:
            print(f"Chunk processing failed: {e}")
            # Retry this chunk with smaller pieces
            sub_results = process_with_smaller_chunks(chunk, speech2text, sr)
            results.extend(sub_results)

    # Merge results (plain concatenation; smarter seam-aware merging can be added)
    full_text = " ".join(results)

    # Post-process: remove repeated words
    full_text = post_process_text(full_text)

    return full_text
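
# Fixed-size windows can cut through the middle of a word. A minimal sketch
# of an alternative that chunks at low-energy (silence) boundaries instead,
# using librosa.effects.split; top_db=30 is an untested assumption to tune:
def split_on_silence(y, sr, top_db=30, max_chunk_seconds=10):
    """Yield chunks bounded by silence, each at most max_chunk_seconds long."""
    max_samples = int(max_chunk_seconds * sr)
    for start, end in librosa.effects.split(y, top_db=top_db):
        # A voiced region longer than the limit still needs fixed slicing
        for s in range(start, end, max_samples):
            yield y[s:min(s + max_samples, end)]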

def find_max_safe_length(model, sr, initial_max=10):
    """Find the longest safe input length by scanning down in 0.5 s steps."""
    max_samples = int(initial_max * sr)
    min_samples = int(1 * sr)

    # Probe with synthetic test audio (low-volume noise stands in for speech)
    test_audio = np.random.randn(max_samples) * 0.01

    for test_len in range(max_samples, min_samples, -int(0.5 * sr)):
        try:
            model(test_audio[:test_len])
            print(f"Safe length: {test_len/sr:.1f}s")
            return test_len
        except Exception:
            continue

    return min_samples
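
# A true binary search converges in O(log n) probes instead of a linear scan;
# it assumes failures are monotonic in length (if L samples fail, anything
# longer fails too). A hypothetical sketch under that assumption:
def find_max_safe_length_bisect(model, sr, initial_max=10, tol_seconds=0.5):
    """Bisect between 1 s and initial_max s for the longest passing length."""
    lo, hi = int(1 * sr), int(initial_max * sr)
    test_audio = np.random.randn(hi) * 0.01  # low-volume noise probe
    best = lo
    while hi - lo > int(tol_seconds * sr):
        mid = (lo + hi) // 2
        try:
            model(test_audio[:mid])
            best, lo = mid, mid  # mid works: search the upper half
        except Exception:
            hi = mid  # mid fails: search the lower half
    return best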

def process_with_smaller_chunks(audio, model, sr, chunk_size=5):
    """Process the audio with smaller chunks."""
    results = []
    chunk_samples = int(chunk_size * sr)

    for i in range(0, len(audio), chunk_samples):
        chunk = audio[i:i+chunk_samples]
        if len(chunk) > 0.5 * sr:
            try:
                nbest = model(chunk)
                if nbest and nbest[0]:
                    results.append(nbest[0][0])
            except Exception:
                pass
    return results

def post_process_text(text):
    """Post-process the text, removing likely duplicates."""
    # Simple adjacent-duplicate removal (can be strengthened as needed)
    words = text.split()
    cleaned = []
    for i, word in enumerate(words):
        if i == 0 or word != words[i-1]:
            cleaned.append(word)
    return " ".join(cleaned)

# Usage example
if __name__ == "__main__":
    audio_path = '/data/datasets/1/data_aishell/wav/test/S0768/BAC009S0768W0452.wav'
    model_dir = "/home/sunzhq/workspace/yidong-infer/conformer/onnx_models/transformer_lm"

    result = process_long_audio(audio_path, model_dir)
    print(f"\nFinal transcription: {result}")