Commit 0941998c authored by sunzhq2's avatar sunzhq2 Committed by xuxo
Browse files

conformer add post and ana

parent fde49a28
...@@ -230,3 +230,5 @@
Start Time: 1770708789.470781
End Time: 1770708816.038687
Start Time: 1770709093.392558
End Time: 1770709114.731006
Start Time: 1770901118.677859
End Time: 1770901139.961724
#!/usr/bin/env bash
# Batch-convert all ONNX models under ${input_dir} to a fixed batch size of 24.
# Everything is copied verbatim into ${output_dir}; only the models in
# transformer_lm/full are regenerated by convert_onnx_batch_size.py.
input_dir=/home/sunzhq/workspace/yidong-infer/conformer/onnx_models_1
output_dir=/home/sunzhq/workspace/yidong-infer/conformer/onnx_models_batch24_1

# Start from a clean copy of the source model tree.
rm -rf ${output_dir}
mkdir -p ${output_dir}
cp -r ${input_dir}/* ${output_dir}

# Remove the copied models that the converter will regenerate below.
rm -rf ${output_dir}/transformer_lm/full/*

python convert_onnx_batch_size.py \
    --input ${input_dir}/transformer_lm/full \
    --output ${output_dir}/transformer_lm/full/ \
    --batch_size 24 \
    --batch_mode
\ No newline at end of file
# export_asr_onnx.py
import torch
import numpy as np
import soundfile as sf
from espnet2.bin.asr_inference import Speech2Text # Import the main inference class
# from espnet2.utils.fileio import read_kaldi_ascii_vec # Remove this unused import
import argparse
def get_parser():
    """Build the command-line parser for the ASR-to-ONNX export script."""
    parser = argparse.ArgumentParser(description='Export ESPNet ASR model to ONNX')
    # Both model artifacts are mandatory; the ONNX output paths have defaults.
    required_flags = (
        ('--asr_model_file', 'Path to the trained ASR model .pth file'),
        ('--asr_config', 'Path to the ASR model config.yaml file'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, type=str, required=True, help=help_text)
    parser.add_argument('--output_encoder', type=str, default='asr_encoder.onnx',
                        help='Output ONNX file for encoder')
    parser.add_argument('--output_decoder', type=str, default='asr_decoder.onnx',
                        help='Output ONNX file for decoder')
    return parser
class ONNXEncoder(torch.nn.Module):
    """Wrapper that restricts the ESPNet encoder to the two outputs ONNX needs."""

    def __init__(self, encoder):
        super(ONNXEncoder, self).__init__()
        self.encoder = encoder

    def forward(self, speech, speech_lengths):
        # The wrapped encoder may return extra values beyond the first two
        # (e.g. caches or intermediate states). Only encoder_out and
        # encoder_out_lens are meaningful for the exported graph, so the
        # remainder is deliberately dropped here.
        outputs = self.encoder(speech, speech_lengths)
        encoder_out = outputs[0]
        encoder_out_lens = outputs[1]
        return encoder_out, encoder_out_lens
class ONNXDecoder(torch.nn.Module):
    """ONNX-export wrapper that scores a padded hypothesis batch.

    Combines the attention decoder's next-token log-probabilities with the
    optional CTC head's frame-level log-probabilities — the two quantities a
    rescoring beam search consumes.
    """
    def __init__(self, decoder, decoder_output_layer, ctc=None):
        # decoder: ESPNet attention decoder, invoked below as
        #   decoder(tgt, tgt_mask, memory, memory_mask) — assumes the
        #   transformer decoder call signature; TODO confirm against the
        #   checkpoint's decoder class.
        # decoder_output_layer: Linear projecting decoder states to vocab logits.
        # ctc: optional CTC module exposing log_softmax(encoder_out).
        super(ONNXDecoder, self).__init__()
        self.decoder = decoder
        self.decoder_output_layer = decoder_output_layer
        self.ctc = ctc
    def forward(self, encoder_out, encoder_out_lens, hyp_seq, hyp_len):
        """
        encoder_out: (B, T_enc, D_enc)
        encoder_out_lens: (B,)
        hyp_seq: (B, U) - Padded hypothesis sequence IDs (e.g., <sos>, id1, id2, ...)
        hyp_len: (B,) - Actual lengths of hyp_seq
        Returns:
            ctc_logprobs: (B, T_enc, VocabSize) if CTC exists, else (B, T_enc, 0)
            att_logprobs: (B, VocabSize) for the *next* token prediction based on the whole hyp_seq
        """
        # Prepare masks
        batch_size, T_enc = encoder_out.size(0), encoder_out.size(1)
        U = hyp_seq.size(1)
        # src_mask: valid (non-padded) encoder frames, shape (B, 1, 1, T_enc)
        src_mask = (~make_pad_mask(encoder_out_lens, T_enc)).unsqueeze(1).unsqueeze(2).to(encoder_out.device)  # (B, 1, 1, T_enc)
        # tgt_mask: hypothesis padding mask. Shape here is (B, 1, 1, U); it
        # broadcasts to (B, 1, U, U) when ANDed with the causal mask below.
        tgt_mask = (~make_pad_mask(hyp_len, U)).unsqueeze(1).unsqueeze(2).to(encoder_out.device)
        # Ensure upper triangular part (future info) is masked in tgt_mask
        future_mask = torch.triu(torch.ones(U, U, device=tgt_mask.device), diagonal=1).bool()
        tgt_mask = tgt_mask & (~future_mask.unsqueeze(0))
        # Forward through decoder
        dec_out, _ = self.decoder(hyp_seq, tgt_mask, encoder_out, src_mask)  # dec_out: (B, U, D_dec)
        # Gather the decoder state at each hypothesis's *last* valid position
        # (hyp_len - 1) — that state predicts the next token.
        last_token_states = dec_out[torch.arange(batch_size), hyp_len - 1]  # (B, D_dec)
        # Apply the output layer to get logits, then log_softmax
        att_logits = self.decoder_output_layer(last_token_states)  # (B, VocabSize)
        att_logprobs = att_logits.log_softmax(dim=-1)  # (B, VocabSize)
        # Also return CTC log probs if available; without CTC, emit an empty
        # (B, T_enc, 0) placeholder so the ONNX graph keeps a fixed output arity.
        ctc_logprobs = self.ctc.log_softmax(encoder_out) if self.ctc else torch.empty(batch_size, T_enc, 0, device=encoder_out.device, dtype=encoder_out.dtype)  # (B, T_enc, VocabSize)
        return ctc_logprobs, att_logprobs
def make_pad_mask(lengths, max_len=None):
    """Return a boolean mask that is True at padded positions.

    lengths: (B,) integer tensor of valid sequence lengths.
    max_len: width of the mask; defaults to lengths.max().
    Returns: (B, max_len) bool tensor, True wherever index >= length.
    """
    if max_len is None:
        max_len = int(lengths.max())
    positions = torch.arange(0, max_len, dtype=torch.int64, device=lengths.device)
    # Broadcast (1, max_len) position indices against (B, 1) lengths:
    # a position is padding once it reaches the sequence's valid length.
    return positions.unsqueeze(0) >= lengths.unsqueeze(-1)
def get_nested_attr(obj, attr_path):
    """Resolve a dot-separated attribute path (e.g. 'decoder.output_layer') on obj."""
    head, _, rest = attr_path.partition('.')
    value = getattr(obj, head)
    # Recurse on the remaining path components until none are left.
    return get_nested_attr(value, rest) if rest else value
def main():
    """Export an ESPNet ASR model as two ONNX graphs: encoder and decoder (scoring).

    Reads checkpoint/config paths from the CLI (see get_parser), rebuilds the
    ASR model with ASRTask, locates the decoder's vocabulary-sized output
    layer by scanning Linear modules, then exports an encoder graph and a
    decoder scoring graph with dynamic batch/time axes.
    """
    parser = get_parser()
    args = parser.parse_args()
    # Load the speech2text inference object
    # NOTE(review): speech2text is constructed but never used below —
    # presumably a sanity check that the checkpoint loads end-to-end;
    # confirm before removing.
    speech2text = Speech2Text(
        asr_train_config=args.asr_config,
        asr_model_file=args.asr_model_file,
        device="cpu",  # or "cuda"
        maxlenratio=0.0,
        minlenratio=0.0,
        batch_size=1,
        dtype="float32",
        beam_size=1,  # Use beam_size=1 for ONNX export to avoid dynamic control flow
        ctc_weight=0.5,  # or the value from your config (0.3)
        lm_weight=0.0,  # Disable LM for this export
        penalty=0.0,
        nbest=1
    )
    # --- Access the internal ASR model ---
    # NOTE(review): `checkpoint` is loaded but unused; build_model_from_file
    # below re-reads the weights itself.
    checkpoint = torch.load(args.asr_model_file, map_location="cpu")
    from omegaconf import OmegaConf
    from espnet2.tasks.asr import ASRTask
    config = OmegaConf.load(args.asr_config)
    asr_model, *_ = ASRTask.build_model_from_file(
        args.asr_config,
        args.asr_model_file,
        device="cpu"  # or "cuda"
    )
    asr_model.eval()
    print("Loaded internal ASR model successfully.")
    # print("Model attributes:", [attr for attr in dir(asr_model) if not attr.startswith('_')])  # Commented out as it was verbose
    # --- Find the correct output layer name ---
    vocab_size = len(config.token_list)  # Get vocab size from config
    print(f"Expected vocab size: {vocab_size}")
    found_layer = False
    decoder_output_layer_path = ""  # Store the full path
    # Heuristic: take the first Linear module whose out_features equals the
    # vocabulary size as the decoder's output projection.
    for name, module in asr_model.named_modules():
        if isinstance(module, torch.nn.Linear):
            if module.out_features == vocab_size:
                print(f"Found potential output layer: {name} -> {module}")
                decoder_output_layer_path = name  # e.g., 'decoder.output_layer'
                found_layer = True
                break
    if not found_layer:
        print("Could not find a Linear layer matching the vocab size. Please inspect the model manually.")
        # You might need to print the full model structure: print(asr_model)
        # Or iterate through children/modules more carefully.
        return  # Exit if not found
    print(f"Using '{decoder_output_layer_path}' as the decoder output layer path.")
    # --- Export Encoder ---
    encoder_wrapper = ONNXEncoder(asr_model.encoder)
    # assumes 80-dim features (log-mel) — TODO confirm against the frontend config
    dummy_speech = torch.randn(1, 200, 80, dtype=torch.float32)  # (B, T, F) - Adjust T and F!
    dummy_speech_lengths = torch.LongTensor([200])  # Actual length in frames
    torch.onnx.export(
        encoder_wrapper,
        (dummy_speech, dummy_speech_lengths),
        args.output_encoder,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=['speech', 'speech_lengths'],
        output_names=['encoder_out', 'encoder_out_lens'],
        dynamic_axes={
            'speech': {0: 'batch_size', 1: 'time'},
            'speech_lengths': {0: 'batch_size'},
            'encoder_out': {0: 'batch_size', 1: 'time_enc'},
            'encoder_out_lens': {0: 'batch_size'}
        }
    )
    print(f"Encoder exported to {args.output_encoder}")
    # --- Export Decoder (Scoring Part) ---
    # Use the helper function to get the nested attribute
    target_layer = get_nested_attr(asr_model, decoder_output_layer_path)
    decoder_wrapper = ONNXDecoder(asr_model.decoder, target_layer, getattr(asr_model, 'ctc', None))
    dummy_encoder_out = torch.randn(1, 100, asr_model.encoder.output_size(), dtype=torch.float32)  # (B, T_enc, D_enc)
    dummy_encoder_out_lens = torch.LongTensor([100])
    sos_id_val = asr_model.sos
    # Dummy hypothesis: <sos> followed by two arbitrary token IDs, padded to 5.
    dummy_hyp_seq = torch.full((1, 5), 0, dtype=torch.long)  # Initialize with padding ID (often 0)
    dummy_hyp_seq[0, 0] = sos_id_val  # <sos>
    dummy_hyp_seq[0, 1] = 100  # Some dummy token ID
    dummy_hyp_seq[0, 2] = 200  # Another dummy token ID
    dummy_hyp_len = torch.LongTensor([3])  # Length of the actual sequence (excluding padding)
    dyn_axes_dec = {
        'encoder_out': {0: 'batch_size', 1: 'time_enc'},
        'encoder_out_lens': {0: 'batch_size'},
        'hyp_seq': {0: 'batch_size', 1: 'time_dec'},
        'hyp_len': {0: 'batch_size'},
        'ctc_logprobs': {0: 'batch_size', 1: 'time_enc'},  # Only dynamic if CTC is used
        'att_logprobs': {0: 'batch_size'}
    }
    torch.onnx.export(
        decoder_wrapper,
        (dummy_encoder_out, dummy_encoder_out_lens, dummy_hyp_seq, dummy_hyp_len),
        args.output_decoder,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=['encoder_out', 'encoder_out_lens', 'hyp_seq', 'hyp_len'],
        output_names=['ctc_logprobs', 'att_logprobs'],  # Outputs for scoring
        dynamic_axes=dyn_axes_dec
    )
    print(f"Decoder (scoring part) exported to {args.output_decoder}")


if __name__ == "__main__":
    main()
\ No newline at end of file
# Export the Conformer ASR encoder/decoder to ONNX.
# Earlier invocation (with --stats_file) kept for reference:
# python export_asr_onnx.py \
# --asr_model_file exp/asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp/valid.acc.ave_10best.pth \
# --asr_config exp/asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp/config.yaml \
# --output_encoder ./onnx_models/asr_encoder.onnx \
# --output_decoder ./onnx_models/asr_decoder.onnx \
# --stats_file exp/asr_stats_raw_sp/train/feats_stats.npz
python export_asr_onnx.py \
    --asr_model_file exp/asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp/valid.acc.ave_10best.pth \
    --asr_config exp/asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp/config.yaml \
    --output_encoder ./onnx_models/asr_encoder.onnx \
    --output_decoder ./onnx_models/asr_decoder.onnx
# --stats_file exp/asr_stats_raw_sp/train/feats_stats.npz # Removed from script, add back if needed later
\ No newline at end of file
# export_lm_onnx.py
import torch
from espnet2.tasks.lm import LMTask
from omegaconf import OmegaConf
import argparse
def get_parser():
    """Build the command-line parser for the LM-to-ONNX export script."""
    parser = argparse.ArgumentParser(description='Export ESPNet LM model to ONNX')
    # Checkpoint and config are mandatory; the output path has a default.
    for flag, help_text in (
        ('--lm_file', 'Path to the trained LM .pth file'),
        ('--lm_config', 'Path to the LM config.yaml file'),
    ):
        parser.add_argument(flag, type=str, required=True, help=help_text)
    parser.add_argument('--output', type=str, default='lm.onnx',
                        help='Output ONNX file for LM')
    return parser
class ONNXLMLayer(torch.nn.Module):
    """Exposes the LM's core predictor network with an ONNX-friendly forward."""

    def __init__(self, lm_model):
        super(ONNXLMLayer, self).__init__()
        # The trainable network of an ESPNet LM lives in `predictor` (the
        # Transformer LM itself). Inspect lm_model.children()/modules() if
        # your checkpoint is structured differently.
        self.predictor = lm_model.predictor

    def forward(self, input_ids, lengths):
        # input_ids: (B, T) token IDs; lengths: (B,) valid sequence lengths.
        # The predictor returns (logits, state); only the per-position
        # next-token logits, shape (B, T, vocab_size), are exported.
        logits, _ = self.predictor(input_ids, lengths)
        return logits
def main():
    """Export the ESPNet Transformer LM predictor to a single ONNX graph."""
    parser = get_parser()
    args = parser.parse_args()
    # Load config
    config = OmegaConf.load(args.lm_config)
    # Build model directly using ESPNet's task interface
    model, *_ = LMTask.build_model_from_file(
        args.lm_config,
        args.lm_file,
        device="cpu"  # or "cuda"
    )
    model.eval()
    # Wrap the core LM predictor module
    predictor_wrapper = ONNXLMLayer(model)
    # Example input shapes
    # input_ids: (B, T) - token IDs
    # lengths: (B,) - actual lengths
    # Get vocab size from the model's output layer
    # NOTE(review): `embed` is typically an nn.Embedding (no out_features), so
    # this normally falls through to output_layer.out_features — confirm for
    # tied-embedding configurations.
    vocab_size = model.predictor.embed.out_features if hasattr(model.predictor.embed, 'out_features') else model.predictor.output_layer.out_features
    print(f"Detected LM vocab size: {vocab_size}")
    dummy_input_ids = torch.randint(low=0, high=vocab_size, size=(1, 10))  # Batch=1, Seq Len=10
    dummy_lengths = torch.LongTensor([10])
    torch.onnx.export(
        predictor_wrapper,
        (dummy_input_ids, dummy_lengths),
        args.output,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=['input_ids', 'lengths'],
        output_names=['logits'],
        dynamic_axes={
            'input_ids': {0: 'batch_size', 1: 'sequence_length'},
            'lengths': {0: 'batch_size'},
            'logits': {0: 'batch_size', 1: 'sequence_length', 2: 'vocab_size'}
        }
    )
    print(f"LM exported to {args.output}")


if __name__ == "__main__":
    main()
\ No newline at end of file
# Export the Transformer LM to ONNX using the script above.
python export_lm_onnx.py \
    --lm_file exp/lm_train_lm_transformer_char_batch_bins2000000/valid.loss.ave_10best.pth \
    --lm_config exp/lm_train_lm_transformer_char_batch_bins2000000/config.yaml \
    --output lm.onnx
\ No newline at end of file
transformer_lm: /home/sunzhq/workspace/yidong-infer/conformer/onnx_models_1/transformer_lm
This diff is collapsed.
transformer_lm: /home/sunzhq/workspace/yidong-infer/conformer/onnx_models_1/transformer_lm
This diff is collapsed.
python -m espnet_onnx.export \
    --model_type asr \
    --input ./asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp_valid.acc.ave.zip \
    --tag transformer_lm \
    --output /home/sunzhq/workspace/yidong-infer/conformer/onnx_models_1 \
    --apply_optimize \
    --max_seq_len 2048
# --batch_size 24
# --model_type {asr,tts}
# task type
# --input INPUT path to the zip file.
......
import onnxruntime as ort
import numpy as np

# Smoke-test: try to load every exported ONNX model and print its input signature.
models = {
    "encoder": "/root/.cache/espnet_onnx/transformer_lm/full/default_encoder.onnx",
    "decoder": "/root/.cache/espnet_onnx/transformer_lm/full/xformer_decoder.onnx",
    "ctc": "/root/.cache/espnet_onnx/transformer_lm/full/ctc.onnx",
    "lm": "/root/.cache/espnet_onnx/transformer_lm/full/transformer_lm.onnx"
}
# /root/.cache/espnet_onnx/transformer_lm/full
for name, path in models.items():
    try:
        # Create an inference session (fails fast on a missing/malformed model file)
        session = ort.InferenceSession(path)
        # Query the model's declared input names and shapes
        inputs = session.get_inputs()
        print(f"\n{name} 模型加载成功")
        print(f" 输入: {[i.name for i in inputs]}")
        print(f" 形状: {[i.shape for i in inputs]}")
    except Exception as e:
        print(f"\n{name} 模型加载失败: {e}")
\ No newline at end of file
#!/usr/bin/bash
# Run torch-based ASR inference with LM rescoring over the wav.scp/text
# manifest, one background job per GPU, then compute WER and RTF from the logs.
# if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

# Trained ASR / LM checkpoints, their configs, and the test manifest directory.
asr_train_config="/home/sunzhq/workspace/yidong-infer/conformer/34e9cabc2c29fd0e3a2917ffa525d98b/exp/asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp/config.yaml"
asr_model_file="/home/sunzhq/workspace/yidong-infer/conformer/34e9cabc2c29fd0e3a2917ffa525d98b/exp/asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp/valid.acc.ave_10best.pth"
lm_train_config=/home/sunzhq/workspace/yidong-infer/conformer/34e9cabc2c29fd0e3a2917ffa525d98b/exp/lm_train_lm_transformer_char_batch_bins2000000/config.yaml
lm_path=/home/sunzhq/workspace/yidong-infer/conformer/34e9cabc2c29fd0e3a2917ffa525d98b/exp/lm_train_lm_transformer_char_batch_bins2000000/valid.loss.ave_10best.pth
manifest="/home/sunzhq/workspace/yidong-infer/conformer/torch-infer/test"
mkdir -p logs

# Decoding mode: 'attention_rescoring' or 'lm_rescoring'.
# mode='attention_rescoring'
mode='lm_rescoring'
# num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
num_gpus=1
for ((i = 0; i < $num_gpus; ++i)); do
    {
        # gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
        gpu_id=0
        # numactl pins the process to NUMA node 0 to keep memory local.
        numactl -N 0 -m 0 python3 infer.py \
            --config $asr_train_config \
            --model_path $asr_model_file \
            --lm_config $lm_train_config \
            --lm_path $lm_path \
            --gpu $gpu_id \
            --wav_scp $manifest/wav.scp --text $manifest/text \
            --result_file ./logs/predictions_${mode}_$gpu_id.txt \
            --log_file ./logs/log_${mode}_$gpu_id.txt \
            --batch_size 24 --beam_size 10 \
            --mode $mode || exit 1
        # Score predictions against the reference transcript (character-level).
        python3 2.py --char=1 --v=1 \
            $manifest/text ./logs/predictions_${mode}_$gpu_id.txt > ./logs/wer_${mode}_$gpu_id || exit 1
        # NOTE(review): the two greps below read rank 0's files even inside the
        # per-GPU loop — confirm this is intended for multi-GPU runs.
        cat ./logs/wer_${mode}_0 | grep "Overall"
        cat ./logs/log_${mode}_0.txt | grep RTF
    } &
done
wait
# fi

# gpu_all=$[`nvidia-smi -L |wc -l` - 1]
gpu_all=1
# Summarize throughput (from RTF lines) and average WER into logs/results.txt.
# grep RTF logs/log_lm_rescoring_[0-${gpu_all}].txt |awk 'BEGIN{sum=0;line=0}{sum+=$11;line++}END{print "total throughput for " line " gpus:" sum "audio_sample_len/time_end"}'>./logs/results.txt
# grep Overall logs/wer_lm_rescoring_[0-${gpu_all}] |awk 'BEGIN{sum=0;line=0}{sum+=$3;line++}END{ans=sum/line;print "avg wer: " ans "%" }' >>./logs/results.txt
grep RTF logs/log_lm_rescoring_0.txt |awk 'BEGIN{sum=0;line=0}{sum+=$11;line++}END{print "total throughput for " line " gpus:" sum "audio_sample_len/time_end"}'>./logs/results.txt
grep Overall logs/wer_lm_rescoring_0 |awk 'BEGIN{sum=0;line=0}{sum+=$3;line++}END{ans=sum/line;print "avg wer: " ans "%" }' >>./logs/results.txt
This diff is collapsed.
# Aggregate the per-rank result_*.log files into summary metrics:
#   maxtime  - slowest rank's total inference wall time (max of total_inf)
#   inffps   - sum and mean of per-rank inference throughput (avg_infer_fps)
#   loadtime - slowest rank's data-loading wall time (max of load_data_total)
#   loadfps  - sum and mean of per-rank data-loading throughput (load_data_avg)
maxtime=`cat result_*.log |grep "^total_inf" |awk 'BEGIN{s=0}{if(s<$2) s=$2}END{print s}'`
#inftime=`cat result_*.log |grep "^total_inf" |awk '{s+=$2}END{print s, s/NR}'`
inffps=`cat result_*.log |grep "^avg_infer_fps" |awk '{s+=$2}END{print s, s/NR}'`
loadtime=`cat result_*.log |grep "^load_data_total" |awk 'BEGIN{s=0}{if(s<$2) s=$2}END{print s}'`
loadfps=`cat result_*.log |grep "^load_data_avg" |awk '{s+=$2}END{print s, s/NR}'`
echo "max infer time: $maxtime"
echo "Average infer fps: $inffps"
echo "max load time: $loadtime"
echo "Average load fps: $loadfps"
This diff is collapsed.
    return output_path

# 使用
input_path = "/home/sunzhq/workspace/yidong-infer/conformer/onnx_models_batch24_1/transformer_lm/full/default_encoder.onnx"
output_path = input_path.replace('.onnx', '_fp16.onnx')
convert_to_fp16_with_transformers(input_path, output_path)
\ No newline at end of file
<!-- Generated by scripts/utils/show_asr_result.sh -->
# RESULTS
## Environments
- date: `Mon Oct 19 13:56:23 JST 2020`
- python version: `3.7.3 (default, Mar 27 2019, 22:11:17) [GCC 7.3.0]`
- espnet version: `espnet 0.9.0`
- pytorch version: `pytorch 1.6.0`
- Git hash: `20b0c89369d9dd3e05780b65fdd00a9b4f4891e5`
- Commit date: `Mon Oct 12 09:28:20 2020 -0400`
## asr_train_asr_conformer3_raw_char_batch_bins4000000_accum_grad4_sp
### WER
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_rnn_lm_lm_train_lm_char_valid.loss.ave_asr_model_valid.acc.ave/dev|14326|14326|64.8|35.2|0.0|0.0|35.2|35.2|
|decode_asr_rnn_lm_lm_train_lm_char_valid.loss.ave_asr_model_valid.acc.ave/test|7176|7176|63.5|36.5|0.0|0.0|36.5|36.5|
|decode_asr_rnn_lm_lm_train_lm_transformer_char_batch_bins2000000_valid.loss.ave_asr_model_valid.acc.ave/dev|14326|14326|66.3|33.7|0.0|0.0|33.7|33.7|
|decode_asr_rnn_lm_lm_train_lm_transformer_char_batch_bins2000000_valid.loss.ave_asr_model_valid.acc.ave/test|7176|7176|65.0|35.0|0.0|0.0|35.0|35.0|
### CER
|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_rnn_lm_lm_train_lm_char_valid.loss.ave_asr_model_valid.acc.ave/dev|14326|205341|95.5|4.4|0.1|0.1|4.6|35.2|
|decode_asr_rnn_lm_lm_train_lm_char_valid.loss.ave_asr_model_valid.acc.ave/test|7176|104765|95.2|4.7|0.1|0.1|4.9|36.5|
|decode_asr_rnn_lm_lm_train_lm_transformer_char_batch_bins2000000_valid.loss.ave_asr_model_valid.acc.ave/dev|14326|205341|95.7|4.2|0.1|0.1|4.4|33.7|
|decode_asr_rnn_lm_lm_train_lm_transformer_char_batch_bins2000000_valid.loss.ave_asr_model_valid.acc.ave/test|7176|104765|95.4|4.5|0.1|0.1|4.7|35.0|
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment