# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method1, inference from model hub
model="iic/speech_sanm_kws_phone-xiaoyun-commands-online"
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
++chunk_size='[4, 8, 4]' \
++encoder_chunk_look_back=0 \
++decoder_chunk_look_back=0 \
+device="cpu" \
++keywords="\"${keywords_string}\""
python funasr/bin/inference.py \
+model=${model} \
+input=${input} \
+output_dir="./outputs/debug" \
++chunk_size='[5, 10, 5]' \
++encoder_chunk_look_back=0 \
++decoder_chunk_look_back=0 \
+device="cpu" \
++keywords="\"${keywords_string}\""
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# method2, inference from local model
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/KWS/pos_testset/kws_xiaoyunxiaoyun.wav"
output_dir="./outputs/debug"
workspace=$(pwd)
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_sanm_kws_phone-xiaoyun-commands-online
git clone https://www.modelscope.cn/iic/speech_sanm_kws_phone-xiaoyun-commands-online.git ${local_path}
device="cpu" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
config="inference_sanm_6e_320_256_fdim40_t2602_online.yaml"
tokens="${local_path}/tokens_2602.txt"
seg_dict="${local_path}/lexicon.txt"
init_param="${local_path}/finetune_sanm_6e_320_256_fdim40_t2602_online_xiaoyun_commands.pt"
cmvn_file="${local_path}/am.mvn.dim40_l3r3"
keywords=(小云小云)
keywords_string=$(IFS=,; echo "${keywords[*]}")
echo "keywords: $keywords_string"
echo "inference sanm streaming with chunk_size=[4, 8, 4]"
python -m funasr.bin.inference \
--config-path "${local_path}/" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++chunk_size='[4, 8, 4]' \
++encoder_chunk_look_back=0 \
++decoder_chunk_look_back=0 \
++device="${device}" \
++keywords="\"${keywords_string}\""
echo "inference sanm streaming with chunk_size=[5, 10, 5]"
python -m funasr.bin.inference \
--config-path "${local_path}/" \
--config-name "${config}" \
++init_param="${init_param}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++tokenizer_conf.token_list="${tokens}" \
++tokenizer_conf.seg_dict="${seg_dict}" \
++input="${input}" \
++output_dir="${output_dir}" \
++chunk_size='[5, 10, 5]' \
++encoder_chunk_look_back=0 \
++decoder_chunk_look_back=0 \
++device="${device}" \
++keywords="\"${keywords_string}\""
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH
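# This block is typically saved as path.sh and sourced before running the
# recipes in this directory (file name assumed):
#   . ./path.sh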
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
chunk_size = [5, 10, 5]  # middle value sets the chunk duration: [0, 10, 5] -> 600 ms, [0, 8, 4] -> 480 ms
encoder_chunk_look_back = 0 # number of chunks to lookback for encoder self-attention
decoder_chunk_look_back = 0 # number of encoder chunks to lookback for decoder cross-attention
model = AutoModel(
model="/Users/zhifu/Downloads/modelscope_models/speech_SCAMA_asr-zh-cn-16k-common-vocab8358-streaming"
)
cache = {}
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)
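
# The call above decodes the whole file at once; the loop below instead feeds a
# local example file chunk by chunk, carrying state in `cache` to emit partial
# results online.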
import soundfile
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
chunk_stride = chunk_size[1] * 960  # 960 samples = 60 ms at 16 kHz, so 10 -> 600 ms, 8 -> 480 ms
cache = {}
total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
for i in range(total_chunk_num):
speech_chunk = speech[i * chunk_stride : (i + 1) * chunk_stride]
is_final = i == total_chunk_num - 1
res = model.generate(
input=speech_chunk,
cache=cache,
is_final=is_final,
chunk_size=chunk_size,
encoder_chunk_look_back=encoder_chunk_look_back,
decoder_chunk_look_back=decoder_chunk_look_back,
)
print(res)
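
# A minimal sketch of collecting the streaming partials into a single
# transcript instead of printing each chunk result; it assumes each result is
# a list like [{"key": ..., "text": ...}], matching the prints above.
cache = {}
transcript = ""
for i in range(total_chunk_num):
    speech_chunk = speech[i * chunk_stride : (i + 1) * chunk_stride]
    res = model.generate(
        input=speech_chunk,
        cache=cache,
        is_final=(i == total_chunk_num - 1),
        chunk_size=chunk_size,
        encoder_chunk_look_back=encoder_chunk_look_back,
        decoder_chunk_look_back=decoder_chunk_look_back,
    )
    if res and res[0].get("text"):
        transcript += res[0]["text"]
print(transcript)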
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
# vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
# punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
)
# example1
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
hotword="达摩院 魔搭",
    # return_raw_text=True,  # return raw recognition text, split by spaces, equal in length to the timestamps
    # preset_spk_num=2,  # preset number of speakers for the speaker clustering model
    # sentence_timestamp=True,  # return sentence-level information even when spk_model is not given
)
print(res)
"""
# tensor or numpy as input
# example2
import torchaudio
import os
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
input_tensor, sample_rate = torchaudio.load(wav_file)
input_tensor = input_tensor.mean(0)
res = model.generate(input=[input_tensor], batch_size_s=300, is_final=True)
# example3
import soundfile
wav_file = os.path.join(model.model_path, "example/asr_example.wav")
speech, sample_rate = soundfile.read(wav_file)
res = model.generate(input=[speech], batch_size_s=300, is_final=True)
"""
model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
vad_model_revision="master"
punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
punc_model_revision="master"
python funasr/bin/inference.py \
+model=${model} \
+vad_model=${vad_model} \
+vad_model_revision=${vad_model_revision} \
+punc_model=${punc_model} \
+punc_model_revision=${punc_model_revision} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+"hotword='达摩院 魔搭'"
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=$(pwd)
# which GPUs to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train.jsonl, val.jsonl
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json  # referenced below; only read when train_conf.use_deepspeed=true (path assumed)
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
torchrun $DISTRIBUTED_ARGS \
../../../funasr/bin/train_ds.py \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset="AudioDatasetHotword" \
++dataset_conf.index_ds="IndexDSJsonl" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.avg_keep_nbest_models_type='loss' \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
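# stdout/stderr are redirected to ${log_file}; follow training progress with:
#   tail -f ${log_file}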
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
model_dir = "iic/SenseVoiceSmall"
model = AutoModel(
model=model_dir,
vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},  # in ms: VAD cuts audio into segments of at most 30 s
device="cuda:0",
)
# en
res = model.generate(
input=f"{model.model_path}/example/en.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
    merge_vad=True,  # merge short VAD segments, up to merge_length_s seconds each
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
# zh
res = model.generate(
input=f"{model.model_path}/example/zh.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
    merge_vad=True,  # merge short VAD segments, up to merge_length_s seconds each
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
# yue
res = model.generate(
input=f"{model.model_path}/example/yue.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
    merge_vad=True,  # merge short VAD segments, up to merge_length_s seconds each
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
# ja
res = model.generate(
input=f"{model.model_path}/example/ja.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
    merge_vad=True,  # merge short VAD segments, up to merge_length_s seconds each
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
# ko
res = model.generate(
input=f"{model.model_path}/example/ko.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
    merge_vad=True,  # merge short VAD segments, up to merge_length_s seconds each
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
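
# For comparison, the raw hypothesis keeps SenseVoice's special tokens for
# language / emotion / event, e.g. "<|ko|><|NEUTRAL|><|Speech|>..." (tag set
# illustrative); rich_transcription_postprocess strips or renders these tags.
print(res[0]["text"])  # raw text with tags, before postprocessing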
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from pathlib import Path
from funasr_torch import SenseVoiceSmall
from funasr_torch.utils.postprocess_utils import rich_transcription_postprocess
model_dir = "iic/SenseVoiceSmall"
model = SenseVoiceSmall(model_dir, batch_size=10, device="cuda:0")
wav_or_scp = ["{}/.cache/modelscope/hub/{}/example/en.mp3".format(Path.home(), model_dir)]
res = model(wav_or_scp, language="auto", use_itn=True)
print([rich_transcription_postprocess(i) for i in res])
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from pathlib import Path
from funasr_onnx import SenseVoiceSmall
from funasr_onnx.utils.postprocess_utils import rich_transcription_postprocess
model_dir = "iic/SenseVoiceSmall"
model = SenseVoiceSmall(model_dir, batch_size=10, quantize=True)  # quantize=True loads the int8-quantized ONNX model
# inference
wav_or_scp = ["{}/.cache/modelscope/hub/{}/example/en.mp3".format(Path.home(), model_dir)]
res = model(wav_or_scp, language="auto", use_itn=True)
print([rich_transcription_postprocess(i) for i in res])
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model_dir = "iic/SenseVoiceSmall"
model = AutoModel(
model=model_dir,
device="cuda:0",
)
res = model.export(type="onnx", quantize=False)
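
# Once exported, the ONNX model can be loaded through funasr_onnx, mirroring
# the ONNX demo above (a sketch; assumes funasr_onnx is installed and the
# export wrote its files into the cached model directory):
from funasr_onnx import SenseVoiceSmall as SenseVoiceSmallOnnx

onnx_model = SenseVoiceSmallOnnx(model_dir, batch_size=10, quantize=False)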
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
workspace=$(pwd)
# which GPUs to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# model_name from model_hub, or model_dir in local path
## option 1, download model automatically
model_name_or_model_dir="iic/SenseVoiceCTC"
## option 2, download model by git
#local_path_root=${workspace}/modelscope_models
#mkdir -p ${local_path_root}/${model_name_or_model_dir}
#git clone https://www.modelscope.cn/${model_name_or_model_dir}.git ${local_path_root}/${model_name_or_model_dir}
#model_name_or_model_dir=${local_path_root}/${model_name_or_model_dir}
# data dir, which contains: train_example.jsonl, val_example.jsonl
train_data=${workspace}/data/train_example.jsonl
val_data=${workspace}/data/val_example.jsonl
# exp output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
deepspeed_config=${workspace}/../../ds_stage1.json
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
DISTRIBUTED_ARGS="
--nnodes ${WORLD_SIZE:-1} \
--nproc_per_node $gpu_num \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-26669}
"
echo $DISTRIBUTED_ARGS
# funasr trainer path
train_tool=$(dirname $(which funasr))/train_ds.py
torchrun $DISTRIBUTED_ARGS \
${train_tool} \
++model="${model_name_or_model_dir}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.data_split_num=1 \
++dataset_conf.batch_sampler="BatchSampler" \
++dataset_conf.batch_size=6000 \
++dataset_conf.sort_size=1024 \
++dataset_conf.batch_type="token" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=50 \
++train_conf.log_interval=1 \
++train_conf.resume=true \
++train_conf.validate_interval=2000 \
++train_conf.save_checkpoint_interval=2000 \
++train_conf.keep_nbest_models=20 \
++train_conf.avg_nbest_model=10 \
++train_conf.use_deepspeed=false \
++train_conf.deepspeed_config=${deepspeed_config} \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
# Transducer, BAT and RWKV_BAT models are used the same way; just pick the correct model_revision
# https://modelscope.cn/models?name=transducer&page=1&tasks=auto-speech-recognition&type=audio
model = AutoModel(
model="iic/speech_bat_asr-zh-cn-16k-aishell1-vocab4234-pytorch",
)
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
print(res)
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from funasr import AutoModel
model = AutoModel(
model="iic/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline",
)
res = model.generate(
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)
print(res)
""" can not use currently
from funasr import AutoFrontend
frontend = AutoFrontend(model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)
for batch_idx, fbank_dict in enumerate(fbanks):
res = model.generate(**fbank_dict)
print(res)
"""
model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
python funasr/bin/inference.py \
+model=${model} \
+input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U openai-whisper
from funasr import AutoModel
model = AutoModel(
model="iic/Whisper-large-v3",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_kwargs={"max_single_segment_time": 30000},
)
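
# The options below mirror openai-whisper's DecodingOptions (language=None
# enables auto-detection; beam_size=None selects greedy decoding).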
DecodingOptions = {
"task": "transcribe",
"language": None,
"beam_size": None,
"fp16": True,
"without_timestamps": False,
"prompt": None,
}
res = model.generate(
DecodingOptions=DecodingOptions,
batch_size_s=0,
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
)
print(res)
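
# To force a decoding language instead of auto-detection, set it in the options
# dict and decode again (a sketch; codes follow Whisper, e.g. "zh", "en"):
DecodingOptions["language"] = "zh"
res = model.generate(
    DecodingOptions=DecodingOptions,
    batch_size_s=0,
    input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
)
print(res)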
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U openai-whisper
from funasr import AutoModel
# model = AutoModel(model="Whisper-small", hub="openai")
# model = AutoModel(model="Whisper-medium", hub="openai")
# model = AutoModel(model="Whisper-large-v2", hub="openai")
model = AutoModel(
model="Whisper-large-v3",
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
vad_kwargs={"max_single_segment_time": 30000},
hub="openai",
)
DecodingOptions = {
"task": "transcribe",
"language": None,
"beam_size": None,
"fp16": True,
"without_timestamps": False,
"prompt": None,
}
res = model.generate(
DecodingOptions=DecodingOptions,
batch_size_s=0,
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
)
print(res)
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
# To install requirements: pip3 install -U openai-whisper
# method1, inference from model hub
# for more input types, please refer to readme.md
input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
output_dir="./outputs/debug"
model="iic/speech_whisper-large_asr_multilingual"
device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
python -m funasr.bin.inference \
++model=${model} \
++input="${input}" \
++output_dir="${output_dir}" \
++device="${device}"