model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu"
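# A minimal Python equivalent of the CLI call above; a sketch that reuses the
# AutoModel API shown later in this commit, with the same model, input URL and
# output settings.
from funasr import AutoModel

model = AutoModel(
    model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
    output_dir="./outputs/debug",
    device="cpu",
)
res = model.generate(
    input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
print(res)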
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method 1, export the model downloaded from the model hub
from funasr import AutoModel
model = AutoModel(
model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online",
)
res = model.export(type="onnx", quantize=False)
print(res)
# # method 2, export a model from a local path
# from funasr import AutoModel
#
#
# model = AutoModel(
# model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
# )
#
# res = model.export(type="onnx", quantize=False)
# print(res)
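# Optional sanity check of the exported graph with onnxruntime; a sketch only:
# the "model.onnx" path is an assumption, take the real location from the export
# output (res) or the model cache directory.
import onnxruntime as ort

onnx_path = "model.onnx"  # hypothetical path; replace with the exported file
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
print("inputs :", [(i.name, i.shape) for i in sess.get_inputs()])
print("outputs:", [(o.name, o.shape) for o in sess.get_outputs()])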
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method 1, export the model downloaded from the model hub
export HYDRA_FULL_ERROR=1
model="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
python -m funasr.bin.export \
++model=${model} \
++type="onnx" \
++quantize=false \
++device="cpu"
## method 2, export a model from a local path
#model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
#
#python -m funasr.bin.export \
#++model=${model} \
#++type="onnx" \
#++quantize=false \
#++device="cpu" \
#++debug=false
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=`pwd`
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDataset" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
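# Optional sanity check for the jsonl files produced by scp2jsonl above; a sketch
# that assumes each line is a JSON object carrying the "source" and "target"
# fields configured via data_type_list.
import json

def check_jsonl(path, required=("source", "target")):
    """Parse every line and report entries missing the expected fields."""
    bad = 0
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            item = json.loads(line)
            missing = [k for k in required if k not in item]
            if missing:
                bad += 1
                print(f"{path}:{lineno} missing {missing}")
    print(f"{path}: {bad} problematic line(s)")

check_jsonl("../../../data/list/train.jsonl")
check_jsonl("../../../data/list/val.jsonl")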
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(model="Qwen-Audio")
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
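# The prompt below chains Qwen-Audio control tokens. Reading them per the
# Qwen-Audio documentation (an interpretation, not something defined in this repo):
# <|startoftranscription|> starts an ASR task, the two <|en|> tags give the audio
# and output languages, <|transcribe|> selects transcription, <|notimestamps|>
# disables timestamps, and <|wo_itn|> requests output without inverse text
# normalization.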
prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
res = model.generate(input=audio_in, prompt=prompt)
print(res)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(model="Qwen/Qwen-Audio-Chat")
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
# 1st dialogue turn
prompt = "what does the person say?"
cache = {"history": None}
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
print(res)
# 2nd dialogue turn
prompt = 'Find the start time and end time of the word "middle classes"'
res = model.generate(input=None, prompt=prompt, cache=cache)
print(res)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(
model="Qwen-Audio-Chat",
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio-Chat",
)
audio_in = (
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
# 1st dialogue turn
prompt = "what does the person say?"
cache = {"history": None}
res = model.generate(input=audio_in, prompt=prompt, cache=cache)
print(res)
# 2nd dialogue turn
prompt = 'Find the start time and end time of the word "middle classes"'
res = model.generate(input=None, prompt=prompt, cache=cache)
print(res)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U "funasr[llm]"
from funasr import AutoModel
model = AutoModel(
model="Qwen-Audio",
model_path="/nfs/zhifu.gzf/init_model/qwen/Qwen-Audio",
)
audio_in = "https://github.com/QwenLM/Qwen-Audio/raw/main/assets/audio/1272-128104-0000.flac"
prompt = "<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>"
res = model.generate(input=audio_in, prompt=prompt)
print(res)
# network architecture
model: SanmKWS
model_conf:
ctc_weight: 1.0
# encoder
encoder: SANMEncoder
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 320 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
# frontend related
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 20
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 10000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
batch_size: 96000 # if batch_type is example, batch_size is the number of samples; if length, it is source_token_len+target_token_len
max_token_length: 1600 # filter samples if source_token_len+target_token_len > max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
normalize: null
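# Worked numbers for the WavFrontend/LFR settings above; a sketch that assumes the
# usual FunASR low-frame-rate semantics (stack lfr_m fbank frames, advance by lfr_n).
n_mels, frame_shift_ms = 40, 10
lfr_m, lfr_n = 7, 6

feature_dim = n_mels * lfr_m              # 280-dim stacked features fed to the encoder
output_shift_ms = frame_shift_ms * lfr_n  # one stacked frame every 60 ms
print(f"encoder input dim: {feature_dim}, frame shift: {output_shift_ms} ms")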
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_sanm_kws_phone-xiaoyun-commands-offline",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu'
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws/conf"
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws/exp/20240914_xiaoyun_finetune_sanm_6e_320_256_feats_dim40_char_t2602_offline"
config_file="sanm_6e_320_256_fdim40_t2602.yaml"
config_file="config.yaml"
model_path="./modelscope_models_kws/speech_charctc_kws_phone-xiaoyun/funasr/finetune_sanm_6e_320_256_fdim40_t2602_online_xiaoyun_commands.pt"
python -m funasr.bin.export \
--config-path="${config_path}" \
--config-name="${config_file}" \
++init_param=${model_path} \
++type="onnx" \
++quantize=true
#!/usr/bin/env bash
# Set bash to 'strict' mode, so it exits on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=3
inference_device="cpu" #"cpu"
inference_device="cuda" #"cpu"
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${model_name_or_model_dir}
fi
config=sanm_6e_320_256_fdim40_t2602.yaml
token_list=${model_name_or_model_dir}/tokens_2602.txt
lexicon_list=${model_name_or_model_dir}/lexicon.txt
cmvn_file=${model_name_or_model_dir}/am.mvn.dim40_l3r3
init_param="${model_name_or_model_dir}/basetrain_sanm_6e_320_256_fdim40_t2602_offline.pt"
# data preparation
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
# ++batch_size="${inference_batch_size}"
}&
done
wait
for f in detect score; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi
../../../funasr
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_sanm_kws_phone-xiaoyun-commands-offline"
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
+device="cpu" \
++keywords="\"$keywords_string"\"
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=`pwd`
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_sanm_kws_phone-xiaoyun-commands-offline
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-offline.git ${local_path}
device="cpu" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_sanm_6e_320_256_fdim40_t2602_offline.yaml"
tokens="${local_path}/tokens_2602.txt"
seg_dict="${local_path}/lexicon.txt"
init_param="${local_path}/finetune_sanm_6e_320_256_fdim40_t2602_offline_xiaoyun_commands.pt"
cmvn_file="${local_path}/am.mvn.dim40_l3r3"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python -m funasr.bin.inference \
--config-path "${local_path}/" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}" \
++keywords="\"$keywords_string"\"
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH
# network architecture
model: SanmKWSStreaming
model_conf:
ctc_weight: 1.0
# encoder
encoder: SANMEncoderChunkOpt
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 320 # the number of units of position-wise feed forward
num_blocks: 6 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe_online
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
chunk_size:
- 16
- 20
stride:
- 8
- 10
pad_left:
- 4
- 5
encoder_att_look_back_factor:
- 0
- 0
decoder_att_look_back_factor:
- 0
- 0
# frontend related
frontend: WavFrontendOnline
frontend_conf:
fs: 16000
window: hamming
n_mels: 40
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 100
keep_nbest_models: 20
avg_nbest_model: 10
avg_keep_nbest_models_type: loss
validate_interval: 50000
save_checkpoint_interval: 50000
avg_checkpoint_interval: 1000
log_interval: 50
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr
scheduler_conf:
warmup_steps: 30000
dataset: AudioDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
batch_type: length # example or length
batch_size: 64000 # if batch_type is example, batch_size is the number of samples; if length, it is source_token_len+target_token_len
max_token_length: 1600 # filter samples if source_token_len+target_token_len > max_token_length
buffer_size: 2048
shuffle: true
num_workers: 8
preprocessor_speech: SpeechPreprocessSpeedPerturb
preprocessor_speech_conf:
speed_perturb: [0.9, 1.0, 1.1]
tokenizer: CharTokenizer
tokenizer_conf:
unk_symbol: <unk>
ctc_conf:
dropout_rate: 0.0
ctc_type: builtin # ctc_type: focalctc, builtin
reduce: true
ignore_nan_grad: true
normalize: null
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_sanm_kws_phone-xiaoyun-commands-online",
keywords="小云小云",
output_dir="./outputs/debug",
device='cpu',
chunk_size=[4, 8, 4],
encoder_chunk_look_back=0,
decoder_chunk_look_back=0,
)
test_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
res = model.generate(input=test_wav, cache={},)
print(res)
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws_streaming/conf"
config_path="/home/pengteng.spt/source/FunASR_KWS/examples/industrial_data_pretraining/sanm_kws_streaming/exp/20240618_xiaoyun_finetune_sanm_6e_320_256_feats_dim40_char_t2602_online_6"
config_file="sanm_6e_320_256_fdim40_t2602.yaml"
config_file="config.yaml"
model_path="./modelscope_models_kws/speech_charctc_kws_phone-xiaoyun/funasr/finetune_sanm_6e_320_256_fdim40_t2602_online_xiaoyun_commands.pt"
python -m funasr.bin.export \
--config-path="${config_path}" \
--config-name="${config_file}" \
++init_param=${model_path} \
++type="onnx" \
++quantize=true
#!/usr/bin/env bash
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# Set bash to 'strict' mode, so it exits on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in a pipeline'
set -e
set -u
set -o pipefail
. ./path.sh
workspace=`pwd`
CUDA_VISIBLE_DEVICES="0,1"
stage=2
stop_stage=4
inference_device="cpu" # set to "cuda" to run inference on GPU
inference_checkpoint="model.pt.avg10"
inference_scp="wav.scp"
inference_batch_size=32
nj=32
test_sets="test"
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically (currently unsupported)
model_name_or_model_dir="iic/speech_sanm_kws_phone-xiaoyun-commands-online"
## option 2, download model by git
local_path_root=${workspace}/modelscope_models
model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
if [ ! -d $model_name_or_model_dir ]; then
mkdir -p ${model_name_or_model_dir}
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-online.git ${model_name_or_model_dir}
fi
config=sanm_6e_320_256_fdim40_t2602.yaml
token_list=${model_name_or_model_dir}/tokens_2602.txt
lexicon_list=${model_name_or_model_dir}/lexicon.txt
cmvn_file=${model_name_or_model_dir}/am.mvn.dim40_l3r3
init_param="${model_name_or_model_dir}/basetrain_sanm_6e_320_256_fdim40_t2602_online.pt"
# data preparation
# data dir, which contains: train.jsonl, val.jsonl
data_dir=../../data
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Generate audio json list"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/train_wav.scp''', '''${data_dir}/train_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python $FUNASR_DIR/funasr/datasets/audio_datasets/scp2jsonl.py \
++scp_file_list='['''${data_dir}/val_wav.scp''', '''${data_dir}/val_text.txt''']' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
fi
# exp output dir
output_dir="${workspace}/exp/finetune_outputs"
# Training Stage
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: KWS Training"
mkdir -p ${output_dir}
current_time=$(date "+%Y-%m-%d_%H-%M")
log_file="${output_dir}/train.log.txt.${current_time}"
echo "log_file: ${log_file}"
echo "finetune use basetrain model: ${init_param}"
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${workspace}/conf" \
--config-name "${config}" \
++init_param="${init_param}" \
++disable_update=true \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++output_dir="${output_dir}" &> ${log_file}
fi
# Testing Stage
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Inference chunk_size: [4, 8, 4]"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}/chunk-4-8-4_elb-0_dlb_0"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++chunk_size='[4, 8, 4]' \
++encoder_chunk_look_back=0 \
++decoder_chunk_look_back=0 \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect score; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi
# Testing Stage
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "stage 4: Inference chunk_size: [5, 10, 5]"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
if [ ${inference_device} == "cuda" ]; then
nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
else
inference_batch_size=1
CUDA_VISIBLE_DEVICES=""
for JOB in $(seq ${nj}); do
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1,"
done
fi
for dset in ${test_sets}; do
inference_dir="${output_dir}/inference-${inference_checkpoint}/${dset}/chunk-5-10-5_elb-0_dlb_0"
_logdir="${inference_dir}/logdir"
echo "inference_dir: ${inference_dir}"
mkdir -p "${_logdir}"
test_data_dir="${data_dir}/${dset}"
key_file=${test_data_dir}/${inference_scp}
split_scps=
for JOB in $(seq "${nj}"); do
split_scps+=" ${_logdir}/keys.${JOB}.scp"
done
$FUNASR_DIR/examples/aishell/paraformer/utils/split_scp.pl "${key_file}" ${split_scps}
gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ })
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
echo "${output_dir}"
export CUDA_VISIBLE_DEVICES=${gpuid}
python ../../../funasr/bin/inference.py \
--config-path="${output_dir}" \
--config-name="config.yaml" \
++init_param="${output_dir}/${inference_checkpoint}" \
++tokenizer_conf.token_list="${token_list}" \
++tokenizer_conf.seg_dict="${lexicon_list}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++keywords="\"$keywords_string"\" \
++input="${_logdir}/keys.${JOB}.scp" \
++output_dir="${inference_dir}/${JOB}" \
++chunk_size='[5, 10, 5]' \
++encoder_chunk_look_back=0 \
++decoder_chunk_look_back=0 \
++device="${inference_device}" \
++ncpu=1 \
++disable_log=true \
++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt
}&
done
wait
for f in detect; do
if [ -f "${inference_dir}/${JOB}/${f}" ]; then
for JOB in $(seq "${nj}"); do
cat "${inference_dir}/${JOB}/${f}"
done | sort -k1 >"${inference_dir}/${f}"
fi
done
python funasr/utils/compute_det_ctc.py \
--keywords ${keywords_string} \
--test_data ${test_data_dir}/wav.scp \
--trans_data ${test_data_dir}/text \
--score_file ${inference_dir}/detect \
--stats_dir ${inference_dir}
done
fi