"backend/apps/vscode:/vscode.git/clone" did not exist on "e31cf0818850225142bdaf67fb72d88c73f89df9"
Commit 39ac40a9 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #2747 failed with stages
in 0 seconds
[build-system]
requires = ["setuptools", "wheel", "cython==0.29.35", "numpy==1.24.3", "packaging"]
[tool.black]
line-length = 120
target-version = ['py310']
exclude = '''
(
/(
\.eggs # exclude a few common directories in the
| \.git # root of the project
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)/
| foo.py # also separately exclude a file named foo.py in
# the root of the project
)
'''
[tool.pytest.ini_options]
addopts = [
"--color=yes",
"--durations=0",
"--strict-markers",
"--doctest-modules",
]
filterwarnings = [
"ignore::DeprecationWarning",
"ignore::UserWarning",
]
log_cli = "True"
markers = [
"slow: slow tests",
]
minversion = "6.0"
testpaths = "tests/"
[tool.coverage.report]
exclude_lines = [
"pragma: nocover",
"raise NotImplementedError",
"raise NotImplementedError()",
"if __name__ == .__main__.:",
]
# --------- pytorch --------- #
torch>=2.0.0
torchvision>=0.15.0
lightning>=2.0.0
torchmetrics>=0.11.4
# --------- hydra --------- #
hydra-core==1.3.2
hydra-colorlog==1.2.0
hydra-optuna-sweeper==1.2.0
# --------- loggers --------- #
# wandb
# neptune-client
# mlflow
# comet-ml
# aim>=3.16.2 # no lower than 3.16.2, see https://github.com/aimhubio/aim/issues/2550
# --------- others --------- #
rootutils # standardizing the project root setup
pre-commit # hooks for applying linters on commit
rich # beautiful text formatting in terminal
pytest # tests
# sh # for running bash commands in some tests (linux/macos only)
phonemizer # phonemization of text
tensorboard
librosa
Cython
numpy
einops
inflect
Unidecode
scipy
torchaudio
matplotlib
pandas
conformer==0.3.2
diffusers # developed using version ==0.25.0
notebook
ipywidgets
gradio==3.43.2
gdown
wget
seaborn
#!/bin/bash
# Schedule execution of many runs
# Run from root folder with: bash scripts/schedule.sh
python src/train.py trainer.max_epochs=5 logger=csv
python src/train.py trainer.max_epochs=10 logger=csv
#!/usr/bin/env python
import os
import numpy
from Cython.Build import cythonize
from setuptools import Extension, find_packages, setup
exts = [
Extension(
name="matcha.utils.monotonic_align.core",
sources=["matcha/utils/monotonic_align/core.pyx"],
)
]
with open("README.md", encoding="utf-8") as readme_file:
README = readme_file.read()
cwd = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(cwd, "matcha", "VERSION"), encoding="utf-8") as fin:
version = fin.read().strip()
def get_requires():
requirements = os.path.join(os.path.dirname(__file__), "requirements.txt")
with open(requirements, encoding="utf-8") as reqfile:
return [str(r).strip() for r in reqfile]
setup(
name="matcha-tts",
version=version,
description="🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching",
long_description=README,
long_description_content_type="text/markdown",
author="Shivam Mehta",
author_email="shivam.mehta25@gmail.com",
url="https://shivammehta25.github.io/Matcha-TTS",
install_requires=get_requires(),
include_dirs=[numpy.get_include()],
include_package_data=True,
packages=find_packages(exclude=["tests", "tests/*", "examples", "examples/*"]),
# use this to customize global commands available in the terminal after installing the package
entry_points={
"console_scripts": [
"matcha-data-stats=matcha.utils.generate_data_statistics:main",
"matcha-tts=matcha.cli:cli",
"matcha-tts-app=matcha.app:main",
"matcha-tts-get-durations=matcha.utils.get_durations_from_trained_model:main",
]
},
ext_modules=cythonize(exts, language_level=3),
python_requires=">=3.9.0",
)
import json
import os.path
import tempfile
import sys
import re
import uuid
import requests
from argparse import ArgumentParser
import torchaudio
from transformers import WhisperFeatureExtractor, AutoTokenizer
from speech_tokenizer.modeling_whisper import WhisperVQEncoder
sys.path.insert(0, "./cosyvoice")
sys.path.insert(0, "./third_party/Matcha-TTS")
from speech_tokenizer.utils import extract_speech_token
import gradio as gr
import torch
audio_token_pattern = re.compile(r"<\|audio_(\d+)\|>")
from flow_inference import AudioDecoder
from audio_process import AudioStreamProcessor
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int, default="8888")
parser.add_argument("--flow-path", type=str, default="./glm-4-voice-decoder")
parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b")
parser.add_argument("--tokenizer-path", type= str, default="THUDM/glm-4-voice-tokenizer")
args = parser.parse_args()
flow_config = os.path.join(args.flow_path, "config.yaml")
flow_checkpoint = os.path.join(args.flow_path, 'flow.pt')
hift_checkpoint = os.path.join(args.flow_path, 'hift.pt')
glm_tokenizer = None
device = "cuda"
audio_decoder: AudioDecoder = None
whisper_model, feature_extractor = None, None
def initialize_fn():
global audio_decoder, feature_extractor, whisper_model, glm_model, glm_tokenizer
if audio_decoder is not None:
return
# GLM
glm_tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
# Flow & Hift
audio_decoder = AudioDecoder(config_path=flow_config, flow_ckpt_path=flow_checkpoint,
hift_ckpt_path=hift_checkpoint,
device=device)
# Speech tokenizer
whisper_model = WhisperVQEncoder.from_pretrained(args.tokenizer_path).eval().to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(args.tokenizer_path)
def clear_fn():
return [], [], '', '', '', None, None
def inference_fn(
temperature: float,
top_p: float,
max_new_token: int,
input_mode,
audio_path: str | None,
input_text: str | None,
history: list[dict],
previous_input_tokens: str,
previous_completion_tokens: str,
):
if input_mode == "audio":
assert audio_path is not None
history.append({"role": "user", "content": {"path": audio_path}})
audio_tokens = extract_speech_token(
whisper_model, feature_extractor, [audio_path]
)[0]
if len(audio_tokens) == 0:
raise gr.Error("No audio tokens extracted")
audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
user_input = audio_tokens
system_prompt = "User will provide you with a speech instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens. "
else:
assert input_text is not None
history.append({"role": "user", "content": input_text})
user_input = input_text
system_prompt = "User will provide you with a text instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens."
# Gather history
inputs = previous_input_tokens + previous_completion_tokens
inputs = inputs.strip()
if "<|system|>" not in inputs:
inputs += f"<|system|>\n{system_prompt}"
inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"
with torch.no_grad():
response = requests.post(
"http://localhost:10000/generate_stream",
data=json.dumps({
"prompt": inputs,
"temperature": temperature,
"top_p": top_p,
"max_new_tokens": max_new_token,
}),
stream=True
)
text_tokens, audio_tokens = [], []
audio_offset = glm_tokenizer.convert_tokens_to_ids('<|audio_0|>')
end_token_id = glm_tokenizer.convert_tokens_to_ids('<|user|>')
complete_tokens = []
prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
this_uuid = str(uuid.uuid4())
tts_speechs = []
tts_mels = []
prev_mel = None
is_finalize = False
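# Ramp the decoding block size from 25 up to 200 audio tokens so the first audio chunk can be produced quickly, with larger blocks later.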
block_size_list = [25,50,100,150,200]
block_size_idx = 0
block_size = block_size_list[block_size_idx]
audio_processor = AudioStreamProcessor()
for chunk in response.iter_lines():
token_id = json.loads(chunk)["token_id"]
if token_id == end_token_id:
is_finalize = True
if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
if block_size_idx < len(block_size_list) - 1:
block_size_idx += 1
block_size = block_size_list[block_size_idx]
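# Decode the audio tokens accumulated so far into a waveform chunk; previously generated tokens and mels are fed back as the prompt so successive chunks stay continuous.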
tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)
if prev_mel is not None:
prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)
tts_speech, tts_mel = audio_decoder.token2wav(tts_token, uuid=this_uuid,
prompt_token=flow_prompt_speech_token.to(device),
prompt_feat=prompt_speech_feat.to(device),
finalize=is_finalize)
prev_mel = tts_mel
audio_bytes = audio_processor.process(tts_speech.clone().cpu().numpy()[0], last=is_finalize)
tts_speechs.append(tts_speech.squeeze())
tts_mels.append(tts_mel)
if audio_bytes:
yield history, inputs, '', '', audio_bytes, None
flow_prompt_speech_token = torch.cat((flow_prompt_speech_token, tts_token), dim=-1)
audio_tokens = []
if not is_finalize:
complete_tokens.append(token_id)
if token_id >= audio_offset:
audio_tokens.append(token_id - audio_offset)
else:
text_tokens.append(token_id)
tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
complete_text = glm_tokenizer.decode(complete_tokens, spaces_between_special_tokens=False)
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
torchaudio.save(f, tts_speech.unsqueeze(0), 22050, format="wav")
history.append({"role": "assistant", "content": {"path": f.name, "type": "audio/wav"}})
history.append({"role": "assistant", "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False)})
yield history, inputs, complete_text, '', None, (22050, tts_speech.numpy())
def update_input_interface(input_mode):
if input_mode == "audio":
return [gr.update(visible=True), gr.update(visible=False)]
else:
return [gr.update(visible=False), gr.update(visible=True)]
# Create the Gradio interface
with gr.Blocks(title="GLM-4-Voice Demo", fill_height=True) as demo:
with gr.Row():
temperature = gr.Number(
label="Temperature",
value=0.2
)
top_p = gr.Number(
label="Top p",
value=0.8
)
max_new_token = gr.Number(
label="Max new tokens",
value=2000,
)
chatbot = gr.Chatbot(
elem_id="chatbot",
bubble_full_width=False,
type="messages",
scale=1,
)
with gr.Row():
with gr.Column():
input_mode = gr.Radio(["audio", "text"], label="Input Mode", value="audio")
audio = gr.Audio(label="Input audio", type='filepath', show_download_button=True, visible=True)
text_input = gr.Textbox(label="Input text", placeholder="Enter your text here...", lines=2, visible=False)
with gr.Column():
submit_btn = gr.Button("Submit")
reset_btn = gr.Button("Clear")
output_audio = gr.Audio(label="Play", streaming=True,
autoplay=True, show_download_button=False)
complete_audio = gr.Audio(label="Last Output Audio (If Any)", show_download_button=True)
gr.Markdown("""## Debug Info""")
with gr.Row():
input_tokens = gr.Textbox(
label=f"Input Tokens",
interactive=False,
)
completion_tokens = gr.Textbox(
label=f"Completion Tokens",
interactive=False,
)
detailed_error = gr.Textbox(
label=f"Detailed Error",
interactive=False,
)
history_state = gr.State([])
respond = submit_btn.click(
inference_fn,
inputs=[
temperature,
top_p,
max_new_token,
input_mode,
audio,
text_input,
history_state,
input_tokens,
completion_tokens,
],
outputs=[history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]
)
respond.then(lambda s: s, [history_state], chatbot)
reset_btn.click(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio])
input_mode.input(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]).then(update_input_interface, inputs=[input_mode], outputs=[audio, text_input])
initialize_fn()
# Launch the interface
demo.launch(
server_port=args.port,
server_name=args.host
)
# seed-tts-eval
:boom: This repository contains the objective test set as proposed in our project, [seed-TTS](https://arxiv.org/abs/2406.02430), along with the scripts for metric calculations. Due to considerations for AI safety, we will NOT be releasing the source code and model weights of seed-TTS. We invite you to experience the speech generation feature within ByteDance products. :boom:
To evaluate the zero-shot speech generation ability of our model, we propose an out-of-domain objective evaluation test set. This test set consists of samples extracted from English (EN) and Mandarin (ZH) public corpora that are used to measure the model's performance on various objective metrics. Specifically, we employ 1,000 samples from the [Common Voice](https://commonvoice.mozilla.org/en) dataset and 2,000 samples from the [DiDiSpeech-2](https://arxiv.org/pdf/2010.09275) dataset.
## Requirements
To install all dependencies, run
```
pip3 install -r requirements.txt
```
## Metrics
The word error rate (WER) and speaker similarity (SIM) metrics are adopted for objective evaluation.
* For WER, we employ [Whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh) as the automatic speech recognition (ASR) engines for English and Mandarin, respectively.
* For SIM, we use WavLM-large fine-tuned on the speaker verification task ([model link](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view)) to extract speaker embeddings, and score each test utterance by the cosine similarity between its embedding and that of the reference clip (see the sketch below).
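As a reference for how SIM is scored, here is a minimal sketch; it assumes the speaker embeddings have already been extracted as 1-D tensors (any speaker-embedding model can stand in for the checkpoint above):
```python
import torch.nn.functional as F

def speaker_similarity(emb_test, emb_ref):
    """Cosine similarity between the embeddings of a synthesized utterance and its reference clip."""
    return F.cosine_similarity(emb_test.unsqueeze(0), emb_ref.unsqueeze(0)).item()
```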
## Dataset
You can download the test set for all tasks from [this link](https://drive.google.com/file/d/1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP/edit).
The test set is organized with meta files. Each line of a meta file contains: filename | prompt text | prompt audio | text to be synthesized | the ground-truth audio for the text to be synthesized (if it exists). Different tasks use different meta files (a minimal parsing sketch follows the list below):
* Zero-shot text-to-speech (TTS):
* EN: en/meta.lst
* ZH: zh/meta.lst
* ZH (hard case): zh/hardcase.lst
* Zero-shot voice conversion (VC):
* EN: en/non_para_reconstruct_meta.lst
* ZH: zh/non_para_reconstruct_meta.lst
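For clarity, a minimal sketch of parsing one zero-shot TTS meta line under the field layout described above (the helper name is illustrative, not part of the released scripts):
```python
def parse_meta_line(line: str):
    fields = line.strip().split("|")
    filename, prompt_text, prompt_wav, target_text = fields[:4]
    ground_truth_wav = fields[4] if len(fields) > 4 else None  # present only when a ground-truth recording exists
    return filename, prompt_text, prompt_wav, target_text, ground_truth_wav
```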
## Code
We also release the evaluation code for both metrics:
```
# WER
bash cal_wer.sh {the path of the meta file} {the directory of synthesized audio} {language: zh or en}
# SIM
bash cal_sim.sh {the path of the meta file} {the directory of synthesized audio} {path/wavlm_large_finetune.pth}
```
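Internally, the WER script scores each utterance with jiwer; a minimal sketch of the quantities it reports (the texts below are toy examples):
```python
from jiwer import compute_measures

measures = compute_measures("the cat sat on the mat", "the cat sit on mat")
wer = measures["wer"]  # (substitutions + deletions + insertions) / reference length
print(wer, measures["substitutions"], measures["deletions"], measures["insertions"])
```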
import sys
import numpy as np
infile=sys.argv[1]
outfile=sys.argv[2]
fout = open(outfile, "w")
fout.write("utt" + '\t' + "wav_res" + '\t' + 'res_wer' + '\t' + 'text_ref' + '\t' + 'text_res' + '\t' + 'res_wer_ins' + '\t' + 'res_wer_del' + '\t' + 'res_wer_sub' + '\n')
wers = []
wers_below50 = []
inses = []
deles = []
subses = []
n_higher_than_50 = 0
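# Utterances with WER above 50% are counted in n_higher_than_50 and excluded from the below-50 average.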
for line in open(infile, "r").readlines():
wav_path, wer, text_ref, text_res, inse, dele, subs = line.strip().split("\t")
if float(wer) > 0.5:
n_higher_than_50 += 1
else:
wers_below50.append(float(wer))
wers.append(float(wer))
inses.append(float(inse))
deles.append(float(dele))
subses.append(float(subs))
fout.write(line)
wer = round(np.mean(wers)*100,3)
wer_below50 = round(np.mean(wers_below50)*100,3)
subs = round(np.mean(subses)*100,3)
dele = round(np.mean(deles)*100,3)
inse = round(np.mean(inses)*100,3)
subs_ratio = round(subs / wer, 3)
dele_ratio = round(dele / wer, 3)
inse_ratio = round(inse / wer, 3)
fout.write(f"WER: {wer}%\n")
fout.close()
print(f"WER: {wer}%\n")
set -x
set -e
meta_lst=$1
output_dir=$2
checkpoint_path=$3
wav_wav_text=$output_dir/wav_res_ref_text
score_file=$output_dir/wav_res_ref_text.wer
python3 get_wav_res_ref_text.py $meta_lst $output_dir $output_dir/wav_res_ref_text
workdir=$(cd $(dirname $0); pwd)
cd $workdir/thirdparty/UniSpeech/downstreams/speaker_verification/
timestamp=$(date +%s)
thread_dir=/tmp/thread_metas_$timestamp/
mkdir $thread_dir
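# One scoring job per GPU: ARNOLD_WORKER_GPU is assumed to be set by the cluster environment, and the pair list is split evenly across the jobs.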
num_job=$ARNOLD_WORKER_GPU
num=`wc -l $wav_wav_text | awk -F' ' '{print $1}'`
num_per_thread=`expr $num / $num_job + 1`
split -l $num_per_thread --additional-suffix=.lst -d $wav_wav_text $thread_dir/thread-
out_dir=/tmp/thread_metas_$timestamp/results/
mkdir $out_dir
num_job_minus_1=`expr $num_job - 1`
if [ ${num_job_minus_1} -ge 0 ];then
for rank in $(seq 0 $((num_job - 1))); do
python3 verification_pair_list_v2.py $thread_dir/thread-0$rank.lst \
--model_name wavlm_large \
--checkpoint $checkpoint_path \
--scores $out_dir/thread-0$rank.sim.out \
--wav1_start_sr 0 \
--wav2_start_sr 0 \
--wav1_end_sr -1 \
--wav2_end_sr -1 \
--device cuda:$rank &
done
fi
wait
rm $wav_wav_text
rm -f $out_dir/merge.out
cat $out_dir/thread-0*.sim.out | grep -v "avg score" >> $out_dir/merge.out
python3 average.py $out_dir/merge.out $score_file
set -x
set -e
meta_lst=$1
output_dir=$2
lang=$3
wav_wav_text=$output_dir/wav_res_ref_text
score_file=$output_dir/wav_res_ref_text.wer
workdir=$(cd $(dirname $0); cd ../; pwd)
python3 get_wav_res_ref_text.py $meta_lst $output_dir $wav_wav_text
python3 prepare_ckpt.py
timestamp=$(date +%s)
thread_dir=/tmp/thread_metas_$timestamp/
mkdir $thread_dir
num_job=$ARNOLD_WORKER_GPU
num=`wc -l $wav_wav_text | awk -F' ' '{print $1}'`
num_per_thread=`expr $num / $num_job + 1`
split -l $num_per_thread --additional-suffix=.lst -d $wav_wav_text $thread_dir/thread-
out_dir=/tmp/thread_metas_$timestamp/results/
mkdir $out_dir
num_job_minus_1=`expr $num_job - 1`
if [ ${num_job_minus_1} -ge 0 ];then
for rank in $(seq 0 $((num_job - 1))); do
sub_score_file=$out_dir/thread-0$rank.wer.out
CUDA_VISIBLE_DEVICES=$rank python3 run_wer.py $thread_dir/thread-0$rank.lst $sub_score_file $lang &
done
fi
wait
#rm $wav_wav_text
#rm -f $out_dir/merge.out
cat $out_dir/thread-0*.wer.out >> $out_dir/merge.out
python3 average_wer.py $out_dir/merge.out $score_file
import sys, os
from tqdm import tqdm
metalst = sys.argv[1]
wav_dir = sys.argv[2]
wav_res_ref_text = sys.argv[3]
f = open(metalst)
lines = f.readlines()
f.close()
f_w = open(wav_res_ref_text, 'w')
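# Each output line: synthesized wav path | (optional prompt wav path |) reference text, as consumed by the WER/SIM scoring scripts.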
for line in tqdm(lines):
if len(line.strip().split('|')) == 5:
utt, prompt_text, prompt_wav, infer_text, infer_wav = line.strip().split('|')
elif len(line.strip().split('|')) == 4:
utt, prompt_text, prompt_wav, infer_text = line.strip().split('|')
elif len(line.strip().split('|')) == 2:
utt, infer_text = line.strip().split('|')
elif len(line.strip().split('|')) == 3:
utt, infer_text, prompt_wav = line.strip().split('|')
if utt.endswith(".wav"):
utt = utt[:-4]
if not os.path.exists(os.path.join(wav_dir, utt + '.wav')):
continue
# tmp
#prompt_wav = infer_wav
if not os.path.isabs(prompt_wav):
prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
# if not os.path.isabs(infer_wav):
# infer_wav = os.path.join(os.path.dirname(metalst), infer_wav)
if len(line.strip().split('|')) == 2:
out_line = '|'.join([os.path.join(wav_dir, utt + '.wav'), infer_text])
else:
out_line = '|'.join([os.path.join(wav_dir, utt + '.wav'), prompt_wav, infer_text])
f_w.write(out_line + '\n')
f_w.close()
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from funasr import AutoModel
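# Loading the models once here downloads and caches the Whisper large-v3 (EN) and Paraformer-zh (ZH) ASR checkpoints before cal_wer.sh spawns the per-GPU scoring jobs.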
device="cuda:0"
model_id = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)
model = AutoModel(model="paraformer-zh")
funasr
zhconv
modelscope
librosa
torch
torchaudio
import sys, os
from tqdm import tqdm
import multiprocessing
from jiwer import compute_measures
from zhon.hanzi import punctuation
import string
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import scipy
import zhconv
from funasr import AutoModel
punctuation_all = punctuation + string.punctuation
wav_res_text_path = sys.argv[1]
res_path = sys.argv[2]
lang = sys.argv[3] # zh or en
device = "cuda:0"
def load_en_model():
model_id = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)
return processor, model
def load_zh_model():
model = AutoModel(model="paraformer-zh")
return model
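# Normalize reference and hypothesis before scoring: strip punctuation, collapse double spaces, split Mandarin into characters, lowercase English.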
def process_one(hypo, truth):
raw_truth = truth
raw_hypo = hypo
for x in punctuation_all:
if x == '\'':
continue
truth = truth.replace(x, '')
hypo = hypo.replace(x, '')
truth = truth.replace('  ', ' ')  # collapse double spaces left by punctuation removal
hypo = hypo.replace('  ', ' ')
if lang == "zh":
truth = " ".join([x for x in truth])
hypo = " ".join([x for x in hypo])
elif lang == "en":
truth = truth.lower()
hypo = hypo.lower()
else:
raise NotImplementedError
measures = compute_measures(truth, hypo)
ref_list = truth.split(" ")
wer = measures["wer"]
subs = measures["substitutions"] / len(ref_list)
dele = measures["deletions"] / len(ref_list)
inse = measures["insertions"] / len(ref_list)
return (raw_truth, raw_hypo, wer, subs, dele, inse)
def run_asr(wav_res_text_path, res_path):
if lang == "en":
processor, model = load_en_model()
elif lang == "zh":
model = load_zh_model()
params = []
for line in open(wav_res_text_path).readlines():
line = line.strip()
if len(line.split('|')) == 2:
wav_res_path, text_ref = line.split('|')
elif len(line.split('|')) == 3:
wav_res_path, wav_ref_path, text_ref = line.split('|')
elif len(line.split('|')) == 4: # for edit
wav_res_path, _, text_ref, wav_ref_path = line.split('|')
else:
raise NotImplementedError
if not os.path.exists(wav_res_path):
continue
params.append((wav_res_path, text_ref))
fout = open(res_path, "w")
n_higher_than_50 = 0
wers_below_50 = []
for wav_res_path, text_ref in tqdm(params):
if lang == "en":
wav, sr = sf.read(wav_res_path)
if sr != 16000:
wav = scipy.signal.resample(wav, int(len(wav) * 16000 / sr))
input_features = processor(wav, sampling_rate=16000, return_tensors="pt").input_features
input_features = input_features.to(device)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
elif lang == "zh":
res = model.generate(input=wav_res_path,
batch_size_s=300)
transcription = res[0]["text"]
transcription = zhconv.convert(transcription, 'zh-cn')
raw_truth, raw_hypo, wer, subs, dele, inse = process_one(transcription, text_ref)
fout.write(f"{wav_res_path}\t{wer}\t{raw_truth}\t{raw_hypo}\t{inse}\t{dele}\t{subs}\n")
fout.flush()
run_asr(wav_res_text_path, res_path)
# ILS-SSL
> [**ILS-SSL**](https://arxiv.org/pdf/2112.08778.pdf): Self-Supervised Learning for Speech Recognition with Intermediate Layer Supervision
The data preparation and pre-training for the first iteration follow the same pipeline as HuBERT. We give example scripts for ILS-HuBERT pre-training and fine-tuning in src/examples/hubert/scripts.
## Pre-Trained and Fine-tuned Models
Model | Pretraining Dataset | Finetuning Dataset | Model
|---|---|---|---
ILS-Base | 960h LibriSpeech | - | [Download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/el_hubert_4_12/checkpoint_best.pt?st=2022-01-04T08%3A05%3A24Z&se=2024-01-05T08%3A05%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=JI8ZOgBhrrKUY4DE2ommnKpyAUuX6OrHfWgdjAT2Xnc%3D)
ILS-Large | 60k hrs Libri-Light | - | [Download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/ils_hubert_large/checkpoint_fixed.pt?st=2022-01-04T08%3A24%3A37Z&se=2025-01-05T08%3A24%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=Dv6svAaI7Td%2BZWUTjTFkhChFbpnAAU6xKNjPbPQnIKM%3D)
ILS-Large | 60k hrs Libri-Light | 960h LibriSpeech | [Download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/ils_hubert_large/checkpoint_ft.pt?st=2022-01-04T08%3A40%3A17Z&se=2025-01-05T08%3A40%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=GKIe%2F1kz%2F1fjGTsQsakJy68jlsFDbKmIVYjH61dhrwA%3D)
## Results on Librispeech
Base Model | Finetuning set| LM | test-clean | test-other
|---|---|---|---|---
wav2vec2.0 | 1 hour | None | 24.5 | 29.7
Hubert | 1 hour | None| 20.9 | 27.5
ILS-SSL | 1 hour | None | 17.9 | 23.1
wav2vec2.0 | 1 hour | 4-gram | 5.5 | 11.3
Hubert | 1 hour | 4-gram | 6.1 | 11.3
ILS-SSL | 1 hour | 4-gram | 5.4 | 10.2
wav2vec2.0 | 10 hour | None | 11.1 | 17.6
Hubert | 10 hour | None| 10.1 | 16.8
ILS-SSL | 10 hour | None | 8.3 | 13.6
wav2vec2.0 | 10 hour | 4-gram | 4.3 | 9.5
Hubert | 10 hour | 4-gram | 4.3 | 9.4
ILS-SSL | 10 hour | 4-gram | 3.8 | 8.1
wav2vec2.0 | 100 hour | None | 6.1 | 13.3
Hubert | 100 hour | None| 6.3 | 13.2
ILS-SSL | 100 hour | None | 4.7 | 10.1
wav2vec2.0 | 100 hour | 4-gram | 3.4| 8.0
Hubert | 100 hour | 4-gram | 3.4 | 8.1
ILS-SSL | 100 hour | 4-gram | 3.0 | 6.9
Large Model | Finetuning set| LM | test-clean | test-other
|---|---|---|---|---
wav2vec2.0 | 1 hour | None | 17.2 | 20.3
Hubert | 1 hour | None| 17.4 | 20.3
ILS-SSL | 1 hour | None | 14.3 | 16.9
wav2vec2.0 | 1 hour | Transf | 2.9 | 5.8
Hubert | 1 hour | Transf | 2.9 | 5.4
ILS-SSL | 1 hour | Transf | 2.8 | 5.3
wav2vec2.0 | 10 hour | None | 6.3 | 10.0
Hubert | 10 hour | None | 6.2 | 9.6
ILS-SSL | 10 hour | None | 6.1 | 9.1
wav2vec2.0 | 10 hour | Transf | 2.6 | 4.9
Hubert | 10 hour | Transf | 2.4 | 4.6
ILS-SSL | 10 hour | Transf | 2.5 | 4.5
wav2vec2.0 | 100 hour | None | 3.1 | 6.3
Hubert | 100 hour | None| 2.9 | 6.0
ILS-SSL | 100 hour | None | 2.9 | 5.8
wav2vec2.0 | 100 hour | Transf | 2.0 | 4.0
Hubert | 100 hour | Transf | 2.1 | 3.9
ILS-SSL | 100 hour | Transf | 2.0 | 4.0
wav2vec2.0 | 960 hour | None | 2.2 | 4.5
Hubert | 960 hour | None | 2.1 | 4.3
ILS-SSL | 960 hour | None | 1.9 | 3.8
wav2vec2.0 | 960 hour | Transf | 1.8 | 3.3
Hubert | 960 hour | Transf | 1.9 | 3.3
ILS-SSL | 960 hour | Transf | 1.8 | 3.2
Attribution-ShareAlike 3.0 Unported
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.
License
THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
1. Definitions
"Adaptation" means a work based upon the Work, or upon the Work and other pre-existing works, such as a translation, adaptation, derivative work, arrangement of music or other alterations of a literary or artistic work, or phonogram or performance and includes cinematographic adaptations or any other form in which the Work may be recast, transformed, or adapted including in any form recognizably derived from the original, except that a work that constitutes a Collection will not be considered an Adaptation for the purpose of this License. For the avoidance of doubt, where the Work is a musical work, performance or phonogram, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered an Adaptation for the purpose of this License.
"Collection" means a collection of literary or artistic works, such as encyclopedias and anthologies, or performances, phonograms or broadcasts, or other works or subject matter other than works listed in Section 1(f) below, which, by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. A work that constitutes a Collection will not be considered an Adaptation (as defined below) for the purposes of this License.
"Creative Commons Compatible License" means a license that is listed at https://creativecommons.org/compatiblelicenses that has been approved by Creative Commons as being essentially equivalent to this License, including, at a minimum, because that license: (i) contains terms that have the same purpose, meaning and effect as the License Elements of this License; and, (ii) explicitly permits the relicensing of adaptations of works made available under that license under this License or a Creative Commons jurisdiction license with the same License Elements as this License.
"Distribute" means to make available to the public the original and copies of the Work or Adaptation, as appropriate, through sale or other transfer of ownership.
"License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.
"Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License.
"Original Author" means, in the case of a literary or artistic work, the individual, individuals, entity or entities who created the Work or if no individual or entity can be identified, the publisher; and in addition (i) in the case of a performance the actors, singers, musicians, dancers, and other persons who act, sing, deliver, declaim, play in, interpret or otherwise perform literary or artistic works or expressions of folklore; (ii) in the case of a phonogram the producer being the person or legal entity who first fixes the sounds of a performance or other sounds; and, (iii) in the case of broadcasts, the organization that transmits the broadcast.
"Work" means the literary and/or artistic work offered under the terms of this License including without limitation any production in the literary, scientific and artistic domain, whatever may be the mode or form of its expression including digital form, such as a book, pamphlet and other writing; a lecture, address, sermon or other work of the same nature; a dramatic or dramatico-musical work; a choreographic work or entertainment in dumb show; a musical composition with or without words; a cinematographic work to which are assimilated works expressed by a process analogous to cinematography; a work of drawing, painting, architecture, sculpture, engraving or lithography; a photographic work to which are assimilated works expressed by a process analogous to photography; a work of applied art; an illustration, map, plan, sketch or three-dimensional work relative to geography, topography, architecture or science; a performance; a broadcast; a phonogram; a compilation of data to the extent it is protected as a copyrightable work; or a work performed by a variety or circus performer to the extent it is not otherwise considered a literary or artistic work.
"You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
"Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images.
"Reproduce" means to make copies of the Work by any means including without limitation by sound or visual recordings and the right of fixation and reproducing fixations of the Work, including storage of a protected performance or phonogram in digital form or other electronic medium.
2. Fair Dealing Rights. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright or rights arising from limitations or exceptions that are provided for in connection with the copyright protection under copyright law or other applicable laws.
3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:
to Reproduce the Work, to incorporate the Work into one or more Collections, and to Reproduce the Work as incorporated in the Collections;
to create and Reproduce Adaptations provided that any such Adaptation, including any translation in any medium, takes reasonable steps to clearly label, demarcate or otherwise identify that changes were made to the original Work. For example, a translation could be marked "The original work was translated from English to Spanish," or a modification could indicate "The original work has been modified.";
to Distribute and Publicly Perform the Work including as incorporated in Collections; and,
to Distribute and Publicly Perform Adaptations.
For the avoidance of doubt:
Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License;
Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor waives the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; and,
Voluntary License Schemes. The Licensor waives the right to collect royalties, whether individually or, in the event that the Licensor is a member of a collecting society that administers voluntary licensing schemes, via that society, from any exercise by You of the rights granted under this License.
The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. Subject to Section 8(f), all rights not expressly granted by Licensor are hereby reserved.
4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:
You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from any Licensor You must, to the extent practicable, remove from the Collection any credit as required by Section 4(c), as requested. If You create an Adaptation, upon notice from any Licensor You must, to the extent practicable, remove from the Adaptation any credit as required by Section 4(c), as requested.
You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; (iii) a Creative Commons jurisdiction license (either this or a later license version) that contains the same License Elements as this License (e.g., Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible License. If you license the Adaptation under one of the licenses mentioned in (iv), you must comply with the terms of that license. If you license the Adaptation under the terms of any of the licenses mentioned in (i), (ii) or (iii) (the "Applicable License"), you must comply with the terms of the Applicable License generally and the following provisions: (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform; (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License; (III) You must keep intact all notices that refer to the Applicable License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform; (IV) when You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License.
If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribution ("Attribution Parties") in Licensor's copyright notice, terms of service or by other reasonable means, the name of such party or parties; (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and (iv) , consistent with Ssection 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). The credit required by this Section 4(c) may be implemented in any reasonable manner; provided, however, that in the case of a Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributing authors of the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributing authors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Original Author, Licensor and/or Attribution Parties, as appropriate, of You or Your use of the Work, without the separate, express prior written permission of the Original Author, Licensor and/or Attribution Parties.
Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right granted in Section 3(b) of this License (the right to make Adaptations) would be deemed to be a distortion, mutilation, modification or other derogatory action prejudicial to the Original Author's honor and reputation, the Licensor will waive or not assert, as appropriate, this Section, to the fullest extent permitted by the applicable national law, to enable You to reasonably exercise Your right under Section 3(b) of this License (right to make Adaptations) but not otherwise.
5. Representations, Warranties and Disclaimer
UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. Termination
This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
8. Miscellaneous
Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in which the License terms are sought to be enforced according to the corresponding provisions of the implementation of those treaty provisions in the applicable national law. If the standard suite of rights granted under applicable copyright law includes additional rights not granted under this License, such additional rights are deemed to be included in the License; this License is not intended to restrict the license of any rights under applicable law.
Creative Commons Notice
Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.
Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of the License.
Creative Commons may be contacted at https://creativecommons.org/.
# UniSpeech
<!--**Pre-trained models for speech related tasks**-->
The family of UniSpeech:
> [**WavLM**](https://arxiv.org/pdf/2110.13900.pdf) (```arXiv```): **WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing**
> [**UniSpeech**](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech) (```ICML 2021```): **Unified Pre-training for Self-Supervised Learning and Supervised Learning for ASR**
> [**UniSpeech-SAT**](https://arxiv.org/pdf/2110.05752.pdf) (```ICASSP 2022 Submission```): **Universal Speech Representation Learning with Speaker Aware Pre-Training**
> [**ILS-SSL**](https://arxiv.org/pdf/2112.08778.pdf) (```ICASSP 2022 Submission```): **Self-Supervised Learning for Speech Recognition with Intermediate Layer Supervision**
Model introductions, evaluation results, and model inference instructions are located in their corresponding folders. The source code is available at https://github.com/microsoft/UniSpeech/tree/main/src.
## Update
- [HuggingFace Integration] Dec 23, 2021: [**WavLM**](https://huggingface.co/models?other=wavlm) models are on [HuggingFace](https://huggingface.co/models?other=wavlm).
- [HuggingFace Integration] October 26, 2021: [**UniSpeech-SAT**](https://huggingface.co/microsoft/unispeech-sat-large) models are on [HuggingFace](https://huggingface.co/models?other=unispeech-sat).
- [Model Release] October 13, 2021: [**UniSpeech-SAT**](https://arxiv.org/pdf/2110.05752.pdf) models are released.
- [HuggingFace Integration] October 11, 2021: [**UniSpeech**](https://huggingface.co/microsoft/unispeech-large-1500h-cv) models are on [HuggingFace](https://huggingface.co/models?other=unispeech).
- [Model Release] June 2021: [**UniSpeech v1**](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech) models are released.
## Pre-trained models
We strongly suggest using our UniSpeech-SAT model for speaker-related tasks, since it shows very strong performance on various speaker-related benchmarks.
Model | Pretraining Dataset | Finetuning Dataset | Model
|---|---|---|-----
UniSpeech Large EN | [Labeled: 1350 hrs en](https://commonvoice.mozilla.org/) | - | [download](https://releasemodel.blob.core.windows.net/models/CommonVoicePretrainedModel/CommonVoiceEnglishPretrainedModel/checkpoint_best.pt?sv=2019-12-12&st=2021-07-14T09%3A00%3A07Z&se=2022-07-15T09%3A00%3A00Z&sr=b&sp=r&sig=5sxvEwVRoGtkazNQYkOuFLlPYau8nl5Ng%2FfRJa0Vnc4%3D)
UniSpeech Large Multilingual | [Labeled: 1350 hrs en + 353 hrs fr + 168 hrs es + 90 hrs it](https://commonvoice.mozilla.org/) | - | [download](https://releasemodel.blob.core.windows.net/models/CommonVoicePretrainedModel/CommonVoiceMultilingualPretrainedModel/checkpoint_best.pt?sv=2019-12-12&st=2021-07-14T09%3A00%3A39Z&se=2022-07-15T09%3A00%3A00Z&sr=b&sp=r&sig=y%2Fd3rqtbyqW0ZCwR7Czho5any90khA%2Ft3w9PTZ6N9vU%3D)
UniSpeech Large+ | [Labeled: 1350 hrs en, Unlabeled: 353 hrs fr](https://commonvoice.mozilla.org/) | - | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/pt_fr353.large.one2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A44%3A54Z&se=2023-10-26T06%3A44%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=7tYuYMxVFfM2Vgi%2BoqUh%2ByJXD4hSuoafHgBP5VZApw0%3D)
UniSpeech Large+ | [Labeled: 1350 hrs en, Unlabeled: 168 hrs es](https://commonvoice.mozilla.org/) | - | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/pt_es168.large.one2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A39%3A37Z&se=2023-10-26T06%3A39%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=T2B5%2BlOI6v64TNdLSe9rdp3R%2B9Q2E35taUOigGW0nsQ%3D)
UniSpeech Large+ | [Labeled: 1350 hrs en, Unlabeled: 90 hrs it](https://commonvoice.mozilla.org/) | - | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/pt_it90.large.one2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A52%3A08Z&se=2023-10-26T06%3A52%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=kXsSJXK9r8UEYlUr2LaJxtPf8m9J2G23MfG725k2DBk%3D)
UniSpeech Large Multilingual | [Labeled: 1350 hrs en + 353 hrs fr + 168 hrs es + 90 hrs it, Unlabeled: 17 hrs ky](https://commonvoice.mozilla.org/) | - | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/pt_ky17.large.many2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A53%3A00Z&se=2022-10-26T06%3A53%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=oCQecalXzC5daaurLLJGQdFNtfYwsBM6pNQrDAsf5i0%3D)
UniSpeech Large+ | [Labeled: 1350 hrs en, Unlabeled: 353 hrs fr](https://commonvoice.mozilla.org/) | 1 hr fr | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/ft_fr-pt_fr353.large.one2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A27%3A53Z&se=2023-10-26T06%3A27%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=9vEa3xqzWu7SYkACn9TQqDtcm%2BKmUcOHhabjbjZuPys%3D)
UniSpeech Large+ | [Labeled: 1350 hrs en, Unlabeled: 168 hrs es](https://commonvoice.mozilla.org/) | 1 hr es | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/ft_es-pt_es168.large.one2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A21%3A34Z&se=2024-10-26T06%3A21%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=G%2B0RddgOh653UzXG95Ljuwv7aG3tu9gXtPXn1ixCiug%3D)
UniSpeech Large+ | [Labeled: 1350 hrs en, Unlabeled: 90 hrs it](https://commonvoice.mozilla.org/) | 1 hr it | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/ft_it-pt_it90.large.one2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A36%3A17Z&se=2023-10-26T06%3A36%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=e1WD9uOCo9sCAdH%2FPZQ4wCD30aCDpZvvu43kJrqq2HE%3D)
UniSpeech Large Multilingual | [Labeled: 1350 hrs en + 353 hrs fr + 168 hrs es + 90 hrs it, Unlabeled: 17 hrs ky](https://commonvoice.mozilla.org/) | 1 hr ky | [download](https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-chengw/models/pt_ky17.large.many2one_unispeech/checkpoint_best.pt?st=2021-10-25T06%3A54%3A04Z&se=2023-10-26T06%3A54%3A00Z&sp=rl&sv=2018-03-28&sr=b&sig=2K3VjMcsbKfBkLVyDlqGhVpIX%2B2ZcA5DTlMhjdkXo3g%3D)
UniSpeech-SAT Base | [960 hrs LibriSpeech](http://www.openslr.org/12) | - | [download](https://drive.google.com/file/d/1l5etRW6W2aP_8I2Fs_8ailGZqEzdrAPz/view?usp=sharing)
UniSpeech-SAT Base+ | [60k hrs Libri-Light](https://github.com/facebookresearch/libri-light) + [10k hrs GigaSpeech](https://github.com/SpeechColab/GigaSpeech) + [24k hrs VoxPopuli](https://github.com/facebookresearch/voxpopuli/tree/main) | - | [download](https://drive.google.com/file/d/1Q1MLVfyOHkSzTjyD-mzSZVjhndEmCvef/view?usp=sharing)
UniSpeech-SAT Large | [60k hrs Libri-Light](https://github.com/facebookresearch/libri-light) + [10k hrs GigaSpeech](https://github.com/SpeechColab/GigaSpeech) + [24k hrs VoxPopuli](https://github.com/facebookresearch/voxpopuli/tree/main) | - | [download](https://drive.google.com/file/d/12ScE1G2W-AHcccyBb_0uVI6qpFVQ0PaI/view?usp=sharing)
WavLM Base | [960 hrs LibriSpeech](http://www.openslr.org/12)| - | [Azure Storage](https://msranlcmtteamdrive.blob.core.windows.net/share/wavlm/WavLM-Base.pt?sv=2020-04-08&st=2021-11-05T00%3A35%3A31Z&se=2022-11-06T00%3A35%3A00Z&sr=b&sp=r&sig=JljnRVzyHY6AjHzhVmHV5KyQQCvvGfgp9D2M02oGJBU%3D) <br> [Google Drive](https://drive.google.com/file/d/19-C7SMQvEFAYLG5uc47NX_MY03JCbI4x/view?usp=sharing)
WavLM Base+ | [60k hrs Libri-Light](https://github.com/facebookresearch/libri-light) + [10k hrs GigaSpeech](https://github.com/SpeechColab/GigaSpeech) + [24k hrs VoxPopuli](https://github.com/facebookresearch/voxpopuli/tree/main)| - | [Azure Storage](https://msranlcmtteamdrive.blob.core.windows.net/share/wavlm/WavLM-Base+.pt?sv=2020-04-08&st=2021-11-05T00%3A34%3A47Z&se=2022-10-06T00%3A34%3A00Z&sr=b&sp=r&sig=Gkf1IByHaIn1t%2FVEd9D6WHjZ3zu%2Fk5eSdoj21UytKro%3D) <br> [Google Drive](https://drive.google.com/file/d/1PlbT_9_B4F9BsD_ija84sUTVw7almNX8/view?usp=sharing)
WavLM Large | [60k hrs Libri-Light](https://github.com/facebookresearch/libri-light) + [10k hrs GigaSpeech](https://github.com/SpeechColab/GigaSpeech) + [24k hrs VoxPopuli](https://github.com/facebookresearch/voxpopuli/tree/main)| - | [Azure Storage](https://msranlcmtteamdrive.blob.core.windows.net/share/wavlm/WavLM-Large.pt?sv=2020-08-04&st=2021-11-22T10%3A03%3A53Z&se=2022-11-23T10%3A03%3A00Z&sr=b&sp=r&sig=3kB8dwTCyIS8YQ7gW5oXmDrXV%2FAaLmoxBS37oPpFsz4%3D) <br> [Google Drive](https://drive.google.com/file/d/1rMu6PQ9vz3qPz4oIm72JDuIr5AHIbCOb/view?usp=sharing)
## Universal Representation Evaluation on SUPERB
![alt text](WavLM/WavLM_SUPERB_Results.png)
## Downstream Task Performance
We also evaluate our models on typical speaker related benchmarks.
### Speaker Verification
Finetune the model with the VoxCeleb2 dev data and evaluate it on [VoxCeleb1](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/); an embedding-extraction sketch follows the results below.
| Model |Fix pre-train| Vox1-O | Vox1-E | Vox1-H |
| ------------- |------------- | ---------- | ---------- | ---------- |
| ECAPA-TDNN | - | 0.87 | 1.12 | 2.12 |
| HuBERT large | Yes| 0.888 |0.912| 1.853 |
| Wav2Vec2.0 (XLSR)| Yes | 0.915| 0.945 |1.895|
| UniSpeech-SAT large | Yes | 0.771 | 0.781| 1.669|
| WavLM large | Yes | 0.59 | 0.65| 1.328|
| WavLM large | No | 0.505 | 0.579| 1.176|
|+Large Margin Finetune and Score Calibration|
| HuBERT large | No| 0.585| 0.654 |1.342|
| Wav2Vec2.0 (XLSR) | No| 0.564| 0.605 |1.23|
| UniSpeech-SAT large | No | 0.564 | 0.561| 1.23 |
| **WavLM large (New)** | No | **0.33** | **0.477**| **0.984** |
[Large-scale Self-Supervised Speech Representation Learning for Automatic Speaker Verification](https://arxiv.org/pdf/2110.05777.pdf)
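As a reference, the following minimal sketch extracts speaker embeddings with the WavLM speaker-verification checkpoint published on the Hugging Face hub and scores a trial by cosine similarity; `microsoft/wavlm-base-plus-sv` is used here for illustration (the fine-tuned large model above is loaded via this repository's code instead):
```python
import torch
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector

extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv")
model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv").eval()

def embed(waveform_16k):
    # waveform_16k: 1-D float array sampled at 16 kHz
    inputs = extractor(waveform_16k, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        return model(**inputs).embeddings[0]

def verify(wav_enroll, wav_test):
    """Cosine similarity between the speaker embeddings of two waveforms."""
    return torch.nn.functional.cosine_similarity(embed(wav_enroll), embed(wav_test), dim=0).item()
```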
### Speech Separation
Evaluation on [LibriCSS](https://github.com/chenzhuo1011/libri_css)
| Model |0S | 0L | OV10 | OV20 |OV30 |OV40 |
| ---------------- |------| ------ | ------ | ------ | ------ | ------ |
| [Conformer](https://ieeexplore.ieee.org/abstract/document/9413423/) (SOTA) | 4.5 | 4.4 |6.2 |8.5| 11 |12.6|
| UniSpeech-SAT base | 4.4| 4.4 |5.4| 7.2| 9.2 |10.5|
| UniSpeech-SAT large | 4.3| 4.2 |5.0 |6.3| 8.2| 8.8|
| WavLM base+ | 4.5| 4.4 |5.6| 7.5| 9.4 |10.9|
| **WavLM large** | 4.2| 4.1 | 4.8 | 5.8 | 7.4| 8.5|
### Speaker Diarization
Evaluation on CALLHOME
| Model |spk_2 |spk_3| spk_4| spk_5| spk_6| spk_all |
| ---------------- |------| ------ | ------ | ------ | ------ | ------ |
| [EEND-vector clustering](https://arxiv.org/pdf/2105.09040.pdf) | 7.96| 11.93 |16.38| 21.21| 23.1 |12.49|
| [EEND-EDA clustering](https://arxiv.org/abs/2107.01545) (SOTA) | 7.11| 11.88 |14.37| 25.95| 21.95 |11.84|
| UniSpeech-SAT large | 5.93| 10.66| 12.9 |16.48| 23.25| 10.92|
| WavLM Base| 6.99| 11.12| 15.20 |16.48| 21.61| 11.75|
| **WavLM large** | 6.46| 10.69| 11.84 |12.89| 20.70| 10.35|
## License
This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) project.
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
### Reference
If you find our work useful in your research, please cite the following papers:
``` latex
@inproceedings{Wang2021UniSpeech,
author = {Chengyi Wang and Yu Wu and Yao Qian and Kenichi Kumatani and Shujie Liu and Furu Wei and Michael Zeng and Xuedong Huang},
editor = {Marina Meila and Tong Zhang},
title = {UniSpeech: Unified Speech Representation Learning with Labeled and
Unlabeled Data},
booktitle = {Proceedings of the 38th International Conference on Machine Learning,
{ICML} 2021, 18-24 July 2021, Virtual Event},
series = {Proceedings of Machine Learning Research},
volume = {139},
pages = {10937--10947},
publisher = {{PMLR}},
year = {2021},
url = {http://proceedings.mlr.press/v139/wang21y.html},
timestamp = {Thu, 21 Oct 2021 16:06:12 +0200},
biburl = {https://dblp.org/rec/conf/icml/0002WQK0WZ021.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
``` latex
@article{Chen2021WavLM,
title = {WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing},
author = {Sanyuan Chen and Chengyi Wang and Zhengyang Chen and Yu Wu and Shujie Liu and Zhuo Chen and Jinyu Li and Naoyuki Kanda and Takuya Yoshioka and Xiong Xiao and Jian Wu and Long Zhou and Shuo Ren and Yanmin Qian and Yao Qian and Jian Wu and Michael Zeng and Furu Wei},
eprint={2110.13900},
archivePrefix={arXiv},
primaryClass={cs.CL},
year={2021}
}
```
``` latex
@article{Chen2021UniSpeechSAT,
title = {UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training},
author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Zhengyang Chen and Zhuo Chen and Shujie Liu and Jian Wu and Yao Qian and Furu Wei and Jinyu Li and Xiangzhan Yu},
eprint={2110.05752},
archivePrefix={arXiv},
primaryClass={cs.CL},
year={2021}
}
```
### Contact Information
For help or issues using UniSpeech models, please submit a GitHub issue.
For other communications related to UniSpeech, please contact Yu Wu (`yuwu1@microsoft.com`).
# UniSpeech-SAT
> [**UniSpeech-SAT**](https://arxiv.org/pdf/2110.05752.pdf) (```ICASSP 2022 Submission```): **Universal Speech Representation Learning with Speaker Aware Pre-Training**
## Universal Representation Evaluation on SUPERB
![UniSpeech-SAT results on the SUPERB benchmark](UniSpeech_SAT_SUPERB_Results.png)
## Downstream Task Performance
We also evaluate our models on typical speaker related benchmarks.
### Speaker Verification
| Model |Fix pre-train| Vox1-O | Vox1-E | Vox1-H |
| ------------- |------------- | ---------- | ---------- | ---------- |
| ECAPA-TDNN | - | 0.87 | 1.12 | 2.12 |
| HuBERT large | Yes| 0.888 |0.912| 1.853 |
| Wav2Vec2.0 (XLSR)| Yes | 0.915| 0.945 |1.895|
| UniSpeech-SAT large | Yes | 0.771 | 0.781| 1.669|
| HuBERT large | No| 0.585| 0.654 |1.342|
| Wav2Vec2.0 (XLSR) | No| **0.564**| 0.605 |**1.23**|
| **UniSpeech-SAT large** | No | **0.564** | **0.561** | **1.23** |
[Our paper for verification](https://arxiv.org/pdf/2110.05777.pdf)
### Speech Separation
Evaluation on [LibriCSS](https://github.com/chenzhuo1011/libri_css)
| Model |0S | 0L | OV10 | OV20 |OV30 |OV40 |
| ---------------- |------| ------ | ------ | ------ | ------ | ------ |
| [Conformer](https://ieeexplore.ieee.org/abstract/document/9413423/) (SOTA) | 4.5 | 4.4 |6.2 |8.5| 11 |12.6|
| HuBERT base | 4.7| 4.6 | 6.1 | 7.9| 10.6| 12.3|
| UniSpeech-SAT base+ | 4.4| 4.4 |5.4| 7.2| 9.2 |10.5|
| **UniSpeech-SAT large** | **4.3**| **4.2** |**5.0** |**6.3**| **8.2**| **8.8**|
### Speaker Diarization
Evaluation on CALLHOME
| Model |spk_2 |spk_3| spk_4| spk_5| spk_6| spk_all |
| ---------------- |------| ------ | ------ | ------ | ------ | ------ |
| [EEND-vector clustering](https://arxiv.org/pdf/2105.09040.pdf) | 7.96| 11.93 |16.38| 21.21| 23.1 |12.49|
| [EEND-EDA clustering](https://arxiv.org/abs/2107.01545) (SOTA) | 7.11| 11.88 |14.37| 25.95| 21.95 |11.84|
| HuBERT base| 7.93|12.07| 15.21 |19.59| 23.32| 12.63|
| HuBERT large| 7.39| 11.97| 15.76 |19.82| 22.10| 12.40|
| **UniSpeech-SAT large** | **5.93**| **10.66**| **12.9** |**16.48**| **23.25**| **10.92**|
## License
This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) project.
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
### Reference
If you find our work useful in your research, please cite the following paper:
``` latex
@article{Chen2021UniSpeechSAT,
title = {UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware Pre-Training},
author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Zhengyang Chen and Zhuo Chen and Shujie Liu and Jian Wu and Yao Qian and Furu Wei and Jinyu Li and Xiangzhan Yu},
eprint={2110.05752},
archivePrefix={arXiv},
primaryClass={cs.CL},
year={2021}
}
```
### Contact Information
For help or issues using UniSpeech models, please submit a GitHub issue.
For other communications related to UniSpeech, please contact Yu Wu (`yuwu1@microsoft.com`).