Commit 70a8a9e0 authored by wangwei990215

initial commit
# network architecture
model: FsmnKWS
model_conf:
    ctc_weight: 1.0

# encoder related
encoder: FSMN
encoder_conf:
    input_dim: 400
    input_affine_dim: 140
    fsmn_layers: 4
    linear_dim: 250
    proj_dim: 128
    lorder: 10
    rorder: 2
    lstride: 1
    rstride: 1
    output_affine_dim: 140
    output_dim: 2599
    use_softmax: false

frontend: WavFrontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 80
    frame_length: 25
    frame_shift: 10
    lfr_m: 5
    lfr_n: 3
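    # note (added for clarity, not a config key): with low frame rate (LFR) the
    # frontend stacks lfr_m consecutive frames, so encoder_conf.input_dim is
    # expected to equal n_mels * lfr_m (80 * 5 = 400 in this config).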
specaug: SpecAugLFR
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
        - 0
        - 30
    lfr_rate: 3
    num_freq_mask: 1
    apply_time_mask: true
    time_mask_width_range:
        - 0
        - 12
    num_time_mask: 1

train_conf:
    accum_grad: 1
    grad_clip: 5
    max_epoch: 100
    keep_nbest_models: 10
    avg_nbest_model: 10
    avg_keep_nbest_models_type: loss
    validate_interval: 50000
    save_checkpoint_interval: 50000
    avg_checkpoint_interval: 1000
    log_interval: 50

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000

dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
    batch_sampler: EspnetStyleBatchSampler
    batch_type: length # example or length
    batch_size: 32000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len + target_token_len
    max_token_length: 1600 # filter out samples whose source_token_len + target_token_len > max_token_length
    buffer_size: 2048
    shuffle: true
    num_workers: 8
    preprocessor_speech: SpeechPreprocessSpeedPerturb
    preprocessor_speech_conf:
        speed_perturb: [0.9, 1.0, 1.1]

tokenizer: CharTokenizer
tokenizer_conf:
    unk_symbol: <unk>
    split_with_space: true
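    # note (added for clarity): token_list and seg_dict are not set here; the
    # finetune script supplies them at run time via the
    # ++tokenizer_conf.token_list and ++tokenizer_conf.seg_dict overrides.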
ctc_conf:
    dropout_rate: 0.0
    ctc_type: builtin
    reduce: true
    ignore_nan_grad: true
    extra_linear: false

normalize: null
# network architecture
model: FsmnKWS
model_conf:
    ctc_weight: 1.0

# encoder related
encoder: FSMN
encoder_conf:
    input_dim: 360
    input_affine_dim: 280
    fsmn_layers: 4
    linear_dim: 280
    proj_dim: 200
    lorder: 10
    rorder: 2
    lstride: 1
    rstride: 1
    output_affine_dim: 400
    output_dim: 2602
    use_softmax: false

frontend: WavFrontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 40
    frame_length: 25
    frame_shift: 10
    lfr_m: 9
    lfr_n: 3

specaug: SpecAugLFR
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
        - 0
        - 30
    lfr_rate: 3
    num_freq_mask: 1
    apply_time_mask: true
    time_mask_width_range:
        - 0
        - 12
    num_time_mask: 1

train_conf:
    accum_grad: 1
    grad_clip: 5
    max_epoch: 100
    keep_nbest_models: 10
    avg_nbest_model: 10
    avg_keep_nbest_models_type: loss
    validate_interval: 50000
    save_checkpoint_interval: 50000
    avg_checkpoint_interval: 1000
    log_interval: 50

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000

dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
    batch_sampler: EspnetStyleBatchSampler
    batch_type: length # example or length
    batch_size: 32000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len + target_token_len
    max_token_length: 1600 # filter out samples whose source_token_len + target_token_len > max_token_length
    buffer_size: 2048
    shuffle: true
    num_workers: 8
    preprocessor_speech: SpeechPreprocessSpeedPerturb
    preprocessor_speech_conf:
        speed_perturb: [0.9, 1.0, 1.1]

tokenizer: CharTokenizer
tokenizer_conf:
    unk_symbol: <unk>
    split_with_space: true

ctc_conf:
    dropout_rate: 0.0
    ctc_type: builtin
    reduce: true
    ignore_nan_grad: true
    extra_linear: false

normalize: null
from __future__ import print_function

import argparse
import logging
import os
from shutil import copyfile

import torch
import yaml

from funasr.models.fsmn_kws.model import FsmnKWSConvert


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def get_args():
    parser = argparse.ArgumentParser(
        description='load a network and convert it between the kaldi and pytorch formats')
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument(
        '--network_file',
        default='',
        required=True,
        help='input network, supports kaldi .txt / pytorch .pt')
    parser.add_argument('--model_dir', required=True, help='directory to save the model')
    parser.add_argument('--model_name', required=True, help='file name of the saved model')
    parser.add_argument('--convert_to',
                        default='kaldi',
                        required=True,
                        help='target network type, kaldi/pytorch')
    args = parser.parse_args()
    return args


def convert_to_kaldi(configs,
                     network_file,
                     model_dir,
                     model_name="convert.kaldi.txt"):
    copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt'))
    model = FsmnKWSConvert(
        vocab_size=configs['encoder_conf']['output_dim'],
        encoder='FSMNConvert',
        encoder_conf=configs['encoder_conf'],
        ctc_conf=configs['ctc_conf'],
    )
    print(model)
    num_params = count_parameters(model)
    print('the number of model params: {}'.format(num_params))

    states = torch.load(network_file, map_location='cpu')
    model.load_state_dict(states["state_dict"])

    kaldi_text = os.path.join(model_dir, model_name)
    with open(kaldi_text, 'w', encoding='utf8') as fout:
        nnet_desp = model.to_kaldi_net()
        fout.write(nnet_desp)


def convert_to_pytorch(configs,
                       network_file,
                       model_dir,
                       model_name="convert.torch.pt"):
    model = FsmnKWSConvert(
        vocab_size=configs['encoder_conf']['output_dim'],
        frontend=None,
        specaug=None,
        normalize=None,
        encoder='FSMNConvert',
        encoder_conf=configs['encoder_conf'],
        ctc_conf=configs['ctc_conf'],
    )
    num_params = count_parameters(model)
    print('the number of model params: {}'.format(num_params))

    copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt'))
    model.to_pytorch_net(network_file)
    save_model_path = os.path.join(model_dir, model_name)
    torch.save({"model": model.state_dict()}, save_model_path)

    print('convert torch format back to kaldi')
    kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt')
    with open(kaldi_text, 'w', encoding='utf8') as fout:
        nnet_desp = model.to_kaldi_net()
        fout.write(nnet_desp)
    print('Done!')


def main():
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    print(args)

    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)

    if args.convert_to == 'pytorch':
        print('convert kaldi net to pytorch...')
        convert_to_pytorch(configs, args.network_file, args.model_dir, args.model_name)
    elif args.convert_to == 'kaldi':
        print('convert pytorch net to kaldi...')
        convert_to_kaldi(configs, args.network_file, args.model_dir, args.model_name)
    else:
        print('unsupported target network type: {}'.format(args.convert_to))


if __name__ == '__main__':
    main()
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models_kws
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun
if [ ! -d "$local_path" ]; then
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${local_path}
fi
export PATH=${local_path}/runtime:$PATH
export LD_LIBRARY_PATH=${local_path}/runtime:$LD_LIBRARY_PATH
config=./conf/fsmn_4e_l10r2_250_128_fdim80_t2599.yaml
torch_nnet=exp/finetune_outputs/model.pt.avg10
out_dir=exp/finetune_outputs
if [ ! -d "$out_dir" ]; then
mkdir -p $out_dir
fi
python convert.py --config $config --network_file $torch_nnet --model_dir $out_dir --model_name "convert.kaldi.txt" --convert_to kaldi
nnet-copy --binary=true ${out_dir}/convert.kaldi.txt ${out_dir}/convert.kaldi.net
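# The reverse direction is also supported by convert.py; a sketch of the call,
# assuming a Kaldi-format text net produced as above:
# python convert.py --config $config --network_file ${out_dir}/convert.kaldi.txt \
#     --model_dir $out_dir --model_name "convert.torch.pt" --convert_to pytorch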
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

from funasr import AutoModel

model = AutoModel(
    model="iic/speech_charctc_kws_phone-xiaoyun",
    keywords="小云小云",
    output_dir="./outputs/debug",
    device='cpu',
)

test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"

res = model.generate(input=test_wav, cache={})
print(res)
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

# Set bash to 'strict' mode: exit on error (-e), on undefined variables (-u),
# and on errors anywhere in a pipeline (-o pipefail).
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from the model hub, or model_dir as a local path
## option 1: download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_charctc_kws_phone-xiaoyun"
## option 2: download the model via git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d "${model_name_or_model_dir}" ]; then
    mkdir -p ${model_name_or_model_dir}
    git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${model_name_or_model_dir}
fi
config=fsmn_4e_l10r2_250_128_fdim80_t2599.yaml
token_list=${model_name_or_model_dir}/funasr/tokens_2599.txt
lexicon_list=${model_name_or_model_dir}/funasr/lexicon.txt
cmvn_file=${model_name_or_model_dir}/funasr/am.mvn.dim80_l2r2
init_param="${model_name_or_model_dir}/funasr/basetrain_fsmn_4e_l10r2_250_128_fdim80_t2599.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Generate audio json list"
    # generate train.jsonl and val.jsonl from wav.scp and text.txt
    python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
        ++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
        ++data_type_list='["source", "target"]' \
        ++jsonl_file_out="${train_data}"

    python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
        ++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
        ++data_type_list='["source", "target"]' \
        ++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: KWS Training"

    mkdir -p ${output_dir}
    current_time=$(date "+%Y-%m-%d_%H-%M")
    log_file="${output_dir}/train.log.txt.${current_time}"
    echo "log_file: ${log_file}"
    echo "finetune with basetrain model: ${init_param}"

    export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
    gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')

    torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
        ../../../funasr/bin/train.py \
        --config-path "${workspace}/conf" \
        --config-name "${config}" \
        ++init_param="${init_param}" \
        ++disable_update=true \
        ++train_data_set_list="${train_data}" \
        ++valid_data_set_list="${val_data}" \
        ++tokenizer_conf.token_list="${token_list}" \
        ++tokenizer_conf.seg_dict="${lexicon_list}" \
        ++frontend_conf.cmvn_file="${cmvn_file}" \
        ++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Inference"

    keywords=(小云小云)
    keywords_string=$(IFS=,; echo "${keywords[*]}")
    echo "keywords: $keywords_string"

    if [ ${inference_device} == "cuda" ]; then
        nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
    else
        inference_batch_size=1
        CUDA_VISIBLE_DEVICES=""
        for JOB in $(seq ${nj}); do
            CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
        done
    fi

    for dset in ${test_sets}; do
        inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
        _logdir="${inference_dir}/logdir"
        echo "inference_dir: ${inference_dir}"

        mkdir -p "${_logdir}"
        test_data_dir="${data_dir}/${dset}"
        key_file=${test_data_dir}/${inference_scp}

        split_scps=
        for JOB in $(seq "${nj}"); do
            split_scps+=" ${_logdir}/keys.${JOB}.scp"
        done
        $FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}

        gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
        for JOB in $(seq ${nj}); do
            {
                id=$((JOB-1))
                gpuid=${gpuid_list_array[$id]}
                echo "${output_dir}"
                export CUDA_VISIBLE_DEVICES=${gpuid}

                python ../../../funasr/bin/inference.py \
                    --config-path="${output_dir}" \
                    --config-name="config.yaml" \
                    ++init_param="${output_dir}/${inference_checkpoint}" \
                    ++tokenizer_conf.token_list="${token_list}" \
                    ++tokenizer_conf.seg_dict="${lexicon_list}" \
                    ++frontend_conf.cmvn_file="${cmvn_file}" \
                    ++keywords="\"$keywords_string\"" \
                    ++input="${_logdir}/keys.${JOB}.scp" \
                    ++output_dir="${inference_dir}/${JOB}" \
                    ++device="${inference_device}" \
                    ++ncpu=1 \
                    ++disable_log=true \
                    ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
            } &
        done
        wait

        for f in detect; do
            if [ -f "${inference_dir}/${JOB}/${f}" ]; then
                for JOB in $(seq "${nj}"); do
                    cat "${inference_dir}/${JOB}/${f}"
                done | sort -k1 > "${inference_dir}/${f}"
            fi
        done

        python funasr/utils/compute_det_ctc.py \
            --keywords ${keywords_string} \
            --test_data ${test_data_dir}/wav.scp \
            --trans_data ${test_data_dir}/text \
            --score_file ${inference_dir}/detect \
            --stats_dir ${inference_dir}
    done
fi
../../../funasr
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method 1: inference from the model hub
model="iic/speech_charctc_kws_phone-xiaoyun"
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method 2: inference from a local model
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_fsmn_4e_l10r2_250_128_fdim80_t2599.yaml"
tokens="${local_path}/funasr/tokens_2599.txt"
seg_dict="${local_path}/funasr/lexicon.txt"
init_param="${local_path}/funasr/finetune_fsmn_4e_l10r2_250_128_fdim80_t2599_xiaoyun_xiaoyun.pt"
cmvn_file="${local_path}/funasr/am.mvn.dim80_l2r2"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/funasr" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"
export FUNASR_DIR=$PWD/../../..
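# FUNASR_DIR is used by finetune.sh in this example for scp2jsonl.py and split_scp.pl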
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH
# network architecture
model: FsmnKWSMT
model_conf:
    ctc_weight: 1.0

# encoder related
encoder: FSMNMT
encoder_conf:
    input_dim: 400
    input_affine_dim: 140
    fsmn_layers: 4
    linear_dim: 250
    proj_dim: 128
    lorder: 10
    rorder: 2
    lstride: 1
    rstride: 1
    output_affine_dim: 140
    output_dim: 2599
    output_dim2: 4
    use_softmax: false
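    # note (added for clarity, not a config key): output_dim2 appears to size the
    # second output head of the multi-task (MT) model; in the finetune script its
    # detections are collected separately as "detect2".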
frontend: WavFrontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 80
    frame_length: 25
    frame_shift: 10
    lfr_m: 5
    lfr_n: 3

specaug: SpecAugLFR
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
        - 0
        - 30
    lfr_rate: 3
    num_freq_mask: 1
    apply_time_mask: true
    time_mask_width_range:
        - 0
        - 12
    num_time_mask: 1

train_conf:
    accum_grad: 1
    grad_clip: 5
    max_epoch: 100
    keep_nbest_models: 100
    avg_nbest_model: 10
    avg_keep_nbest_models_type: loss
    log_interval: 50

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000

dataset: KwsMTDataset
dataset_conf:
    index_ds: IndexDSJsonl
    batch_sampler: EspnetStyleBatchSampler
    batch_type: length # example or length
    batch_size: 64000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len + target_token_len
    max_token_length: 1600 # filter out samples whose source_token_len + target_token_len > max_token_length
    buffer_size: 2048
    shuffle: true
    num_workers: 8
    preprocessor_speech: SpeechPreprocessSpeedPerturb
    preprocessor_speech_conf:
        speed_perturb: [0.9, 1.0, 1.1]
dataloader: DataloaderMapStyle

tokenizer:
    - CharTokenizer
    - CharTokenizer
tokenizer_conf:
    - unk_symbol: <unk>
      split_with_space: true
      token_list: null
      seg_dict: null
    - unk_symbol: <unk>
      split_with_space: true
      token_list: null
      seg_dict: null

ctc_conf:
    dropout_rate: 0.0
    ctc_type: builtin # focalctc or builtin
    reduce: true
    ignore_nan_grad: true
    extra_linear: false

normalize: null
# network architecture
model: FsmnKWSMT
model_conf:
    ctc_weight: 1.0

# encoder related
encoder: FSMNMT
encoder_conf:
    input_dim: 360
    input_affine_dim: 280
    fsmn_layers: 4
    linear_dim: 280
    proj_dim: 200
    lorder: 10
    rorder: 2
    lstride: 1
    rstride: 1
    output_affine_dim: 400
    output_dim: 2602
    output_dim2: 4
    use_softmax: false

frontend: WavFrontend
frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 40
    frame_length: 25
    frame_shift: 10
    lfr_m: 9
    lfr_n: 3

specaug: SpecAugLFR
specaug_conf:
    apply_time_warp: false
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
        - 0
        - 30
    lfr_rate: 3
    num_freq_mask: 1
    apply_time_mask: true
    time_mask_width_range:
        - 0
        - 12
    num_time_mask: 1

train_conf:
    accum_grad: 1
    grad_clip: 5
    max_epoch: 100
    keep_nbest_models: 100
    avg_nbest_model: 10
    avg_keep_nbest_models_type: loss
    log_interval: 50

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 10000

dataset: KwsMTDataset
dataset_conf:
    index_ds: IndexDSJsonl
    batch_sampler: EspnetStyleBatchSampler
    batch_type: length # example or length
    batch_size: 64000 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len + target_token_len
    max_token_length: 1600 # filter out samples whose source_token_len + target_token_len > max_token_length
    buffer_size: 2048
    shuffle: true
    num_workers: 8
    preprocessor_speech: SpeechPreprocessSpeedPerturb
    preprocessor_speech_conf:
        speed_perturb: [0.9, 1.0, 1.1]
dataloader: DataloaderMapStyle

tokenizer:
    - CharTokenizer
    - CharTokenizer
tokenizer_conf:
    - unk_symbol: <unk>
      split_with_space: true
      token_list: null
      seg_dict: null
    - unk_symbol: <unk>
      split_with_space: true
      token_list: null
      seg_dict: null

ctc_conf:
    dropout_rate: 0.0
    ctc_type: builtin # focalctc or builtin
    reduce: true
    ignore_nan_grad: true
    extra_linear: false

normalize: null
from __future__ import print_function

import argparse
import logging
import os
from shutil import copyfile

import torch
import yaml

from funasr.models.fsmn_kws_mt.encoder import FSMNMTConvert
from funasr.models.fsmn_kws_mt.model import FsmnKWSMTConvert


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def get_args():
    parser = argparse.ArgumentParser(
        description='load a network and convert it between the kaldi and pytorch formats')
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument(
        '--network_file',
        default='',
        required=True,
        help='input network, supports kaldi .txt / pytorch .pt')
    parser.add_argument('--model_dir', required=True, help='directory to save the model')
    parser.add_argument('--model_name', required=True, help='file name of the first saved net')
    parser.add_argument('--model_name2', required=True, help='file name of the second saved net')
    parser.add_argument('--convert_to',
                        default='kaldi',
                        required=True,
                        help='target network type, kaldi/pytorch')
    args = parser.parse_args()
    return args


def convert_to_kaldi(configs,
                     network_file,
                     model_dir,
                     model_name="convert.kaldi.txt",
                     model_name2="convert.kaldi2.txt"):
    copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt'))
    model = FsmnKWSMTConvert(
        encoder='FSMNMTConvert',
        encoder_conf=configs['encoder_conf'],
        ctc_conf=configs['ctc_conf'],
    )
    print(model)
    num_params = count_parameters(model)
    print('the number of model params: {}'.format(num_params))

    states = torch.load(network_file, map_location='cpu')
    model.load_state_dict(states["state_dict"])

    kaldi_text = os.path.join(model_dir, model_name)
    with open(kaldi_text, 'w', encoding='utf8') as fout:
        nnet_desp = model.to_kaldi_net()
        fout.write(nnet_desp)

    kaldi_text2 = os.path.join(model_dir, model_name2)
    with open(kaldi_text2, 'w', encoding='utf8') as fout:
        nnet_desp2 = model.to_kaldi_net2()
        fout.write(nnet_desp2)


def convert_to_pytorch(configs,
                       network_file,
                       model_dir,
                       model_name="convert.torch.pt"):
    model = FsmnKWSMTConvert(
        encoder='FSMNMTConvert',
        encoder_conf=configs['encoder_conf'],
        ctc_conf=configs['ctc_conf'],
    )
    num_params = count_parameters(model)
    print('the number of model params: {}'.format(num_params))

    copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt'))
    model.to_pytorch_net(network_file)
    save_model_path = os.path.join(model_dir, model_name)
    torch.save({"model": model.state_dict()}, save_model_path)

    print('convert torch format back to kaldi')
    kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt')
    with open(kaldi_text, 'w', encoding='utf8') as fout:
        nnet_desp = model.to_kaldi_net()
        fout.write(nnet_desp)
    print('Done!')


def main():
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    print(args)

    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)

    if args.convert_to == 'pytorch':
        print('convert kaldi net to pytorch...')
        convert_to_pytorch(configs, args.network_file, args.model_dir, args.model_name)
    elif args.convert_to == 'kaldi':
        print('convert pytorch net to kaldi...')
        # the MT model emits two kaldi nets, so both output names are passed here
        convert_to_kaldi(configs, args.network_file, args.model_dir,
                         args.model_name, args.model_name2)
    else:
        print('unsupported target network type: {}'.format(args.convert_to))


if __name__ == '__main__':
    main()
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun_mt
if [ ! -d "$local_path" ]; then
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${local_path}
fi
export PATH=${local_path}/runtime:$PATH
export LD_LIBRARY_PATH=${local_path}/runtime:$LD_LIBRARY_PATH
# finetune config file
config=./conf/fsmn_4e_l10r2_250_128_fdim80_t2599_t4.yaml
# finetune output checkpoint
torch_nnet=exp/finetune_outputs/model.pt.avg10
out_dir=exp/finetune_outputs
if [ ! -d "$out_dir" ]; then
mkdir -p $out_dir
fi
python convert.py --config $config \
--network_file $torch_nnet \
--model_dir $out_dir \
--model_name "convert.kaldi.txt" \
--model_name2 "convert.kaldi2.txt" \
--convert_to kaldi
nnet-copy --binary=true ${out_dir}/convert.kaldi.txt ${out_dir}/convert.kaldi.net
nnet-copy --binary=true ${out_dir}/convert.kaldi2.txt ${out_dir}/convert.kaldi2.net
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

from funasr import AutoModel

model = AutoModel(
    model="iic/speech_charctc_kws_phone-xiaoyun_mt",
    keywords="小云小云",
    output_dir="./outputs/debug",
    device='cpu',
)

test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"

res = model.generate(input=test_wav, cache={})
print(res)
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

# Set bash to 'strict' mode: exit on error (-e), on undefined variables (-u),
# and on errors anywhere in a pipeline (-o pipefail).
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from the model hub, or model_dir as a local path
## option 1: download the model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_charctc_kws_phone-xiaoyun_mt"
## option 2: download the model via git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d "${model_name_or_model_dir}" ]; then
    mkdir -p ${model_name_or_model_dir}
    git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${model_name_or_model_dir}
fi
config=fsmn_4e_l10r2_250_128_fdim80_t2599_t4.yaml
token_list=${model_name_or_model_dir}/funasr/tokens_2599.txt
token_list2=${model_name_or_model_dir}/funasr/tokens_xiaoyun.txt
lexicon_list=${model_name_or_model_dir}/funasr/lexicon.txt
cmvn_file=${model_name_or_model_dir}/funasr/am.mvn.dim80_l2r2
init_param="${model_name_or_model_dir}/funasr/basetrain_fsmn_4e_l10r2_250_128_fdim80_t2599.pt"
# data prepare
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Generate audio json list"
    # generate train.jsonl and val.jsonl from wav.scp and text.txt
    python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
        ++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
        ++data_type_list='["source", "target"]' \
        ++jsonl_file_out="${train_data}"

    python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
        ++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
        ++data_type_list='["source", "target"]' \
        ++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: KWS Training"

    mkdir -p ${output_dir}
    current_time=$(date "+%Y-%m-%d_%H-%M")
    log_file="${output_dir}/train.log.txt.${current_time}"
    echo "log_file: ${log_file}"
    echo "finetune with basetrain model: ${init_param}"

    export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
    gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')

    torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
        ../../../funasr/bin/train.py \
        --config-path "${workspace}/conf" \
        --config-name "${config}" \
        ++init_param="${init_param}" \
        ++token_lists='['''${token_list}''', '''${token_list2}''']' \
        ++seg_dicts='['''${lexicon_list}''', '''${lexicon_list}''']' \
        ++disable_update=true \
        ++train_data_set_list="${train_data}" \
        ++valid_data_set_list="${val_data}" \
        ++frontend_conf.cmvn_file="${cmvn_file}" \
        ++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Inference"

    keywords=(小云小云)
    keywords_string=$(IFS=,; echo "${keywords[*]}")
    echo "keywords: $keywords_string"

    if [ ${inference_device} == "cuda" ]; then
        nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
    else
        inference_batch_size=1
        CUDA_VISIBLE_DEVICES=""
        for JOB in $(seq ${nj}); do
            CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
        done
    fi

    for dset in ${test_sets}; do
        inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
        _logdir="${inference_dir}/logdir"
        echo "inference_dir: ${inference_dir}"

        mkdir -p "${_logdir}"
        test_data_dir="${data_dir}/${dset}"
        key_file=${test_data_dir}/${inference_scp}

        split_scps=
        for JOB in $(seq "${nj}"); do
            split_scps+=" ${_logdir}/keys.${JOB}.scp"
        done
        $FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}

        gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
        for JOB in $(seq ${nj}); do
            {
                id=$((JOB-1))
                gpuid=${gpuid_list_array[$id]}
                echo "${output_dir}"
                export CUDA_VISIBLE_DEVICES=${gpuid}

                python ../../../funasr/bin/inference.py \
                    --config-path="${output_dir}" \
                    --config-name="config.yaml" \
                    ++init_param="${output_dir}/${inference_checkpoint}" \
                    ++token_lists='['''${token_list}''', '''${token_list2}''']' \
                    ++seg_dicts='['''${lexicon_list}''', '''${lexicon_list}''']' \
                    ++frontend_conf.cmvn_file="${cmvn_file}" \
                    ++keywords="\"$keywords_string\"" \
                    ++input="${_logdir}/keys.${JOB}.scp" \
                    ++output_dir="${inference_dir}/${JOB}" \
                    ++device="${inference_device}" \
                    ++ncpu=1 \
                    ++disable_log=true \
                    ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
            } &
        done
        wait

        for f in detect detect2; do
            if [ -f "${inference_dir}/${JOB}/${f}" ]; then
                for JOB in $(seq "${nj}"); do
                    cat "${inference_dir}/${JOB}/${f}"
                done | sort -k1 > "${inference_dir}/${f}"
            fi
        done

        mkdir -p ${inference_dir}/task1
        python funasr/utils/compute_det_ctc.py \
            --keywords ${keywords_string} \
            --test_data ${test_data_dir}/wav.scp \
            --trans_data ${test_data_dir}/text \
            --score_file ${inference_dir}/detect \
            --stats_dir ${inference_dir}/task1

        mkdir -p ${inference_dir}/task2
        python funasr/utils/compute_det_ctc.py \
            --keywords ${keywords_string} \
            --test_data ${test_data_dir}/wav.scp \
            --trans_data ${test_data_dir}/text \
            --score_file ${inference_dir}/detect2 \
            --stats_dir ${inference_dir}/task2
    done
fi
../../../funasr
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method 1: inference from the model hub
model="iic/speech_charctc_kws_phone-xiaoyun_mt"
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method 2: inference from a local model
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_charctc_kws_phone-xiaoyun_mt
git clone https://www.modelscope.cn/iic/speech_charctc_kws_phone-xiaoyun_mt.git ${local_path}
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_fsmn_4e_l10r2_280_200_fdim40_t2602_t4.yaml"
tokens="${local_path}/funasr/tokens_2602.txt"
tokens2="${local_path}/funasr/tokens_xiaoyun.txt"
seg_dict="${local_path}/funasr/lexicon.txt"
init_param="${local_path}/funasr/finetune_fsmn_4e_l10r2_280_200_fdim40_t2602_t4_xiaoyun_xiaoyun.pt"
cmvn_file="${local_path}/funasr/am.mvn.dim40_l4r4"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/funasr" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++token_lists='['''${tokens}''', '''${tokens2}''']' \
++seg_dicts='['''${seg_dict}''', '''${seg_dict}''']' \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH