Commit 72f5785f authored by huaerkl

v1.0
# @package _global_
task:
label_dir: ???
labels: ["km"]
model:
label_rate: 100
# @package _global_
task:
label_dir: ???
labels: ["km"]
model:
label_rate: 50
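# Note: 100 Hz matches labels extracted at a 10 ms hop (e.g. MFCC-based
# k-means), while 50 Hz matches labels clustered from HuBERT transformer
# features, which the conv extractor downsamples to a 20 ms hop.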
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
seed: 1337
tensorboard_logdir: tblog
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
distributed_training:
ddp_backend: no_c10d
distributed_backend: 'nccl'
distributed_world_size: 32
distributed_port: 29671
nprocs_per_node: 8
find_unused_parameters: true
task:
_name: hubert_pretraining
data: ???
label_dir: ???
labels: ???
label_rate: ${model.label_rate}
sample_rate: 16000
max_sample_size: 250000
min_sample_size: 32000
pad_audio: false
random_crop: true
normalize: false # must be consistent with extractor
dataset:
num_workers: 6
max_tokens: 1400000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
validate_interval_updates: 10000
criterion:
_name: hubert
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: [10,]
optimization:
max_update: 400000
lr: [0.0005]
clip_norm: 10.0
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: hubert
label_rate: ???
skip_masked: false
skip_nomask: false
mask_prob: 0.80
extractor_mode: default
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
final_dim: 256
encoder_layerdrop: 0.05
dropout_input: 0.1
dropout_features: 0.1
dropout: 0.1
attention_dropout: 0.1
feature_grad_mult: 0.1
untie_final_proj: true
activation_dropout: 0.0
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
- task.label_dir
run:
dir: ???
sweep:
dir: ???
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
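# Usage sketch (an assumption based on upstream fairseq's HuBERT recipe; the
# config name presumes this file is saved as hubert_base_librispeech.yaml):
#   fairseq-hydra-train \
#     --config-dir /path/to/config/pretrain \
#     --config-name hubert_base_librispeech \
#     task.data=/path/to/tsv task.label_dir=/path/to/labels \
#     task.labels='["km"]' model.label_rate=100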
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
seed: 1337
tensorboard_logdir: tblog
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
distributed_training:
ddp_backend: no_c10d
distributed_backend: 'nccl'
distributed_world_size: 128
distributed_port: 29671
nprocs_per_node: 8
find_unused_parameters: true
task:
_name: hubert_pretraining
data: ???
label_dir: ???
labels: ???
label_rate: ${model.label_rate}
sample_rate: 16000
max_sample_size: 250000
min_sample_size: 32000
pad_audio: false
random_crop: true
normalize: true # must be consistent with extractor
dataset:
num_workers: 6
max_tokens: 900000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
validate_interval_updates: 10000
criterion:
_name: hubert
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: [10,]
optimization:
max_update: 400000
lr: [0.0015]
clip_norm: 1.0
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: hubert
label_rate: ???
encoder_layers: 24
encoder_embed_dim: 1024
encoder_ffn_embed_dim: 4096
encoder_attention_heads: 16
final_dim: 768
skip_masked: false
skip_nomask: false
mask_prob: 0.80
extractor_mode: layer_norm
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
encoder_layerdrop: 0.0
dropout_input: 0.0
dropout_features: 0.0
dropout: 0.0
attention_dropout: 0.0
layer_norm_first: true
feature_grad_mult: 1.0
untie_final_proj: true
activation_dropout: 0.0
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
run:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
sweep:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
seed: 1337
tensorboard_logdir: tblog
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
distributed_training:
ddp_backend: no_c10d
distributed_backend: 'nccl'
distributed_world_size: 256
distributed_port: 29671
nprocs_per_node: 8
find_unused_parameters: true
task:
_name: hubert_pretraining
data: ???
label_dir: ???
labels: ???
label_rate: ${model.label_rate}
sample_rate: 16000
max_sample_size: 250000
min_sample_size: 32000
pad_audio: false
random_crop: true
normalize: true # must be consistent with extractor
dataset:
num_workers: 6
max_tokens: 360000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
validate_interval_updates: 10000
criterion:
_name: hubert
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: [10,]
optimization:
max_update: 400000
lr: [0.003]
clip_norm: 1.0
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: hubert
label_rate: ???
encoder_layers: 48
encoder_embed_dim: 1280
encoder_ffn_embed_dim: 5120
encoder_attention_heads: 16
final_dim: 1024
skip_masked: false
skip_nomask: false
mask_prob: 0.80
extractor_mode: layer_norm
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
encoder_layerdrop: 0.0
dropout_input: 0.0
dropout_features: 0.0
dropout: 0.0
attention_dropout: 0.0
layer_norm_first: true
feature_grad_mult: 1.0
untie_final_proj: true
activation_dropout: 0.0
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
run:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
sweep:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _global_
hydra:
launcher:
cpus_per_task: 8
gpus_per_node: 8
tasks_per_node: ${hydra.launcher.gpus_per_node}
nodes: 4
comment: null
mem_gb: 384
timeout_min: 4320
max_num_timeout: 100
constraint: volta32gb
name: ${hydra.job.config_name}/${hydra.job.override_dirname}
submitit_folder: ${hydra.sweep.dir}/submitit/%j
distributed_training:
distributed_world_size: 32
distributed_port: 29671
nprocs_per_node: 8
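# nodes (4) x gpus_per_node (8) = 32 GPUs, matching distributed_world_size above.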
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import os.path as op
import re
from tabulate import tabulate
from collections import Counter
def comp_purity(p_xy, axis):
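    # Purity along `axis`: for each bin of the non-reduced axis, take the
    # probability mass of its most likely partner label; the sum of these
    # maxima is the aggregate purity, and dividing by the marginal gives
    # the per-bin purity.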
max_p = p_xy.max(axis=axis)
marg_p = p_xy.sum(axis=axis)
indv_pur = max_p / marg_p
aggr_pur = max_p.sum()
return indv_pur, aggr_pur
def comp_entropy(p):
return (-p * np.log(p + 1e-8)).sum()
def comp_norm_mutual_info(p_xy):
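    # Mutual information I(ref; hyp) = sum_{x,y} p(x,y) log(p(x,y) / (p(x) p(y))),
    # returned raw and normalized by each marginal entropy; MI / H(ref) is the
    # phone-normalized MI (PNMI) used to gauge HuBERT teacher label quality.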
p_x = p_xy.sum(axis=1, keepdims=True)
p_y = p_xy.sum(axis=0, keepdims=True)
pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8)
mi = (p_xy * pmi).sum()
h_x = comp_entropy(p_x)
h_y = comp_entropy(p_y)
return mi, mi / h_x, mi / h_y, h_x, h_y
def pad(labs, n):
if n == 0:
return np.array(labs)
return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n])
def comp_avg_seg_dur(labs_list):
n_frms = 0
n_segs = 0
for labs in labs_list:
labs = np.array(labs)
edges = np.zeros(len(labs)).astype(bool)
edges[0] = True
edges[1:] = labs[1:] != labs[:-1]
n_frms += len(edges)
n_segs += edges.astype(int).sum()
return n_frms / n_segs
def comp_joint_prob(uid2refs, uid2hyps):
"""
Args:
pad: padding for spliced-feature derived labels
"""
cnts = Counter()
skipped = []
abs_frmdiff = 0
for uid in uid2refs:
if uid not in uid2hyps:
skipped.append(uid)
continue
refs = uid2refs[uid]
hyps = uid2hyps[uid]
abs_frmdiff += abs(len(refs) - len(hyps))
min_len = min(len(refs), len(hyps))
refs = refs[:min_len]
hyps = hyps[:min_len]
cnts.update(zip(refs, hyps))
tot = sum(cnts.values())
ref_set = sorted({ref for ref, _ in cnts.keys()})
hyp_set = sorted({hyp for _, hyp in cnts.keys()})
ref2pid = dict(zip(ref_set, range(len(ref_set))))
hyp2lid = dict(zip(hyp_set, range(len(hyp_set))))
# print(hyp_set)
p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float)
for (ref, hyp), cnt in cnts.items():
p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt
p_xy /= p_xy.sum()
return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped
def read_phn(tsv_path, rm_stress=True):
uid2phns = {}
with open(tsv_path) as f:
for line in f:
uid, phns = line.rstrip().split("\t")
phns = phns.split(",")
if rm_stress:
phns = [re.sub("[0-9]", "", phn) for phn in phns]
uid2phns[uid] = phns
return uid2phns
def read_lab(tsv_path, lab_path, pad_len=0, upsample=1):
"""
tsv is needed to retrieve the uids for the labels
"""
with open(tsv_path) as f:
f.readline()
uids = [op.splitext(op.basename(line.rstrip().split()[0]))[0] for line in f]
with open(lab_path) as f:
labs_list = [pad(line.rstrip().split(), pad_len).repeat(upsample) for line in f]
assert len(uids) == len(labs_list)
return dict(zip(uids, labs_list))
def main_lab_lab(
tsv_dir,
lab_dir,
lab_name,
lab_sets,
ref_dir,
ref_name,
pad_len=0,
upsample=1,
verbose=False,
):
# assume tsv_dir is the same for both the reference and the hypotheses
tsv_dir = lab_dir if tsv_dir is None else tsv_dir
uid2refs = {}
for s in lab_sets:
uid2refs.update(read_lab(f"{tsv_dir}/{s}.tsv", f"{ref_dir}/{s}.{ref_name}"))
uid2hyps = {}
for s in lab_sets:
uid2hyps.update(
read_lab(
f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
)
)
_main(uid2refs, uid2hyps, verbose)
def main_phn_lab(
tsv_dir,
lab_dir,
lab_name,
lab_sets,
phn_dir,
phn_sets,
pad_len=0,
upsample=1,
verbose=False,
):
uid2refs = {}
for s in phn_sets:
uid2refs.update(read_phn(f"{phn_dir}/{s}.tsv"))
uid2hyps = {}
tsv_dir = lab_dir if tsv_dir is None else tsv_dir
for s in lab_sets:
uid2hyps.update(
read_lab(
f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
)
)
_main(uid2refs, uid2hyps, verbose)
def _main(uid2refs, uid2hyps, verbose):
(p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped) = comp_joint_prob(
uid2refs, uid2hyps
)
ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0)
hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1)
(mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy)
outputs = {
"ref pur": ref_pur,
"hyp pur": hyp_pur,
"H(ref)": h_ref,
"H(hyp)": h_hyp,
"MI": mi,
"MI/H(ref)": mi_norm_by_ref,
"ref segL": comp_avg_seg_dur(uid2refs.values()),
"hyp segL": comp_avg_seg_dur(uid2hyps.values()),
"p_xy shape": p_xy.shape,
"frm tot": tot,
"frm diff": frmdiff,
"utt tot": len(uid2refs),
"utt miss": len(skipped),
}
print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f"))
if __name__ == "__main__":
"""
    compute the quality of hypothesis labels with respect to frame-level
    phone transcriptions, or to another reference label set if provided
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("lab_dir")
parser.add_argument("lab_name")
parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+")
parser.add_argument(
"--phn_dir",
default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1",
)
parser.add_argument(
"--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+"
)
parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses")
parser.add_argument(
"--upsample", default=1, type=int, help="upsample factor for hypotheses"
)
parser.add_argument("--ref_lab_dir", default="")
parser.add_argument("--ref_lab_name", default="")
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
if args.ref_lab_dir and args.ref_lab_name:
main_lab_lab(
args.tsv_dir,
args.lab_dir,
args.lab_name,
args.lab_sets,
args.ref_lab_dir,
args.ref_lab_name,
args.pad_len,
args.upsample,
args.verbose,
)
else:
main_phn_lab(
args.tsv_dir,
args.lab_dir,
args.lab_name,
args.lab_sets,
args.phn_dir,
args.phn_sets,
args.pad_len,
args.upsample,
args.verbose,
)
# Sharded Feature Extraction and K-means Application
This folder contains scripts for preparing HuBERT labels from tsv files. The
steps are:
1. feature extraction
2. k-means clustering
3. k-means application
## Data preparation
Each `*.tsv` file describes a list of audio files: the first line is the root
directory, and each following line gives the subpath of one audio file plus
its number of samples, separated by a tab (see `get_path_iterator` in
`feature_utils.py`):
```
<root-dir>
<audio-path-1>	<num-samples-1>
<audio-path-2>	<num-samples-2>
...
```
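One way to generate such manifests is fairseq's `wav2vec_manifest.py`; a
minimal sketch (the audio root and `flac` extension below are placeholders):
```sh
python examples/wav2vec/wav2vec_manifest.py /path/to/audio \
  --dest ${tsv_dir} --ext flac --valid-percent 0.01
```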
## Feature extraction
### MFCC feature
Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D
MFCC+delta+ddelta features for the first-iteration HuBERT training, run:
```sh
python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
```
This shards the tsv file into `${nshard}` shards and extracts features for the
`${rank}`-th shard, where rank is an integer in `[0, nshard-1]`. Features are
saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
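Shards are independent, so a whole split can be processed with a simple loop
(or submitted as parallel jobs), e.g.:
```sh
for rank in $(seq 0 $((nshard - 1))); do
  python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
done
```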
### HuBERT feature
To extract features from the `${layer}`-th transformer layer of a trained
HuBERT model saved at `${ckpt_path}`, run:
```sh
python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir}
```
Features are likewise saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
- if you run out of GPU memory, decrease the chunk size with `--max_chunk`, as
  in the example below
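Halving the default chunk size of 1600000 samples, for instance:
```sh
python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir} \
  --max_chunk 800000
```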
## K-means clustering
To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
```sh
python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1
```
This saves the k-means model to `${km_path}`.
- set `--percent -1` to use all data
- more k-means options are listed under the `-h` flag; a quick check of the
  fitted model is sketched below
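A sketch of such a check, loading the saved model with joblib (scikit-learn's
`MiniBatchKMeans` stores centroids in `cluster_centers_`, of shape
`(n_clusters, feature_dim)`):
```sh
python - <<EOF
import joblib
km = joblib.load("${km_path}")
print(km.cluster_centers_.shape)
EOF
```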
## K-means application
To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
```sh
python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
```
This extracts labels for the `${rank}`-th shard out of `${nshard}` shards and
dumps them to `${lab_dir}/${split}_${rank}_${nshard}.km`.
Finally, merge shards for `${split}` by running
```sh
for rank in $(seq 0 $((nshard - 1))); do
cat $lab_dir/${split}_${rank}_${nshard}.km
done > $lab_dir/${split}.km
```
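As a sanity check, the merged file should contain one label line per audio
file, i.e. one line fewer than the tsv (whose first line is the root dir):
```sh
[ $(wc -l < $lab_dir/${split}.km) -eq $(( $(wc -l < ${tsv_dir}/${split}.tsv) - 1 )) ] \
  && echo "OK"
```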
## Create a dummy dict
To create a dummy dictionary, run
```sh
for x in $(seq 0 $((n_clusters - 1))); do
echo "$x 1"
done >> $lab_dir/dict.km.txt
```
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import fairseq
import torch
import torch.nn.functional as F
from feature_utils import get_path_iterator, dump_feature
from fairseq.data.audio.audio_utils import get_features_or_waveform
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_hubert_feature")
class HubertFeatureReader(object):
def __init__(self, ckpt_path, layer, max_chunk=1600000):
(
model,
cfg,
task,
) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
self.model = model[0].eval().cuda()
self.task = task
self.layer = layer
self.max_chunk = max_chunk
logger.info(f"TASK CONFIG:\n{self.task.cfg}")
logger.info(f" max_chunk = {self.max_chunk}")
def read_audio(self, path, ref_len=None):
wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
if wav.ndim == 2:
wav = wav.mean(-1)
assert wav.ndim == 1, wav.ndim
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_feats(self, path, ref_len=None):
x = self.read_audio(path, ref_len=ref_len)
with torch.no_grad():
x = torch.from_numpy(x).float().cuda()
if self.task.cfg.normalize:
x = F.layer_norm(x, x.shape)
x = x.view(1, -1)
feat = []
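            # Encode in fixed-size chunks (default 1600000 samples = 100 s at
            # 16 kHz) so long recordings fit in GPU memory; the chunk outputs
            # are concatenated along the time axis below.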
for start in range(0, x.size(1), self.max_chunk):
x_chunk = x[:, start : start + self.max_chunk]
feat_chunk, _ = self.model.extract_features(
source=x_chunk,
padding_mask=None,
mask=False,
output_layer=self.layer,
)
feat.append(feat_chunk)
return torch.cat(feat, 1).squeeze(0)
def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
reader = HubertFeatureReader(ckpt_path, layer, max_chunk)
generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("split")
parser.add_argument("ckpt_path")
parser.add_argument("layer", type=int)
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("--max_chunk", type=int, default=1600000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import csv
import logging
import os
import os.path as op
import sys
from dump_hubert_feature import HubertFeatureReader
from feature_utils import get_shard_range, dump_feature
from fairseq.data.audio.audio_utils import get_features_or_waveform
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_hubert_feature_s2t")
class HubertFeatureReaderS2T(HubertFeatureReader):
def read_audio(self, path, ref_len=None):
wav = get_features_or_waveform(
path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate
)
if wav.ndim == 2:
wav = wav.mean(-1)
assert wav.ndim == 1, wav.ndim
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_path_iterator(root, tsv, nshard, rank, audio_col_name):
with open(tsv) as f:
reader = csv.DictReader(
f,
delimiter="\t",
quotechar=None,
doublequote=False,
lineterminator="\n",
quoting=csv.QUOTE_NONE,
)
subpaths = [op.join(root, e[audio_col_name]) for e in reader]
start, end = get_shard_range(len(subpaths), nshard, rank)
subpaths = subpaths[start:end]
def iterate():
for subpath in subpaths:
            # note: entries of `subpaths` already include `root` (joined above)
            yield subpath, None
return iterate, len(subpaths)
def main(
root,
tsv_path,
ckpt_path,
layer,
nshard,
rank,
feat_dir,
split,
max_chunk,
audio_col_name,
):
reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk)
generator, num = get_path_iterator(root, tsv_path, nshard, rank, audio_col_name)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("root")
parser.add_argument("tsv_path")
parser.add_argument("ckpt_path")
parser.add_argument("layer", type=int)
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("split")
parser.add_argument("--audio_col_name", type=str, default="audio")
parser.add_argument("--max_chunk", type=int, default=1600000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import numpy as np
import joblib
import torch
import tqdm
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_km_label")
class ApplyKmeans(object):
def __init__(self, km_path):
self.km_model = joblib.load(km_path)
self.C_np = self.km_model.cluster_centers_.transpose()
self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)
self.C = torch.from_numpy(self.C_np)
self.Cnorm = torch.from_numpy(self.Cnorm_np)
if torch.cuda.is_available():
self.C = self.C.cuda()
self.Cnorm = self.Cnorm.cuda()
def __call__(self, x):
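        # Squared Euclidean distance to every centroid via the expansion
        # ||x - c||^2 = ||x||^2 - 2 x·c + ||c||^2, then take the argmin.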
if isinstance(x, torch.Tensor):
dist = (
x.pow(2).sum(1, keepdim=True)
- 2 * torch.matmul(x, self.C)
+ self.Cnorm
)
return dist.argmin(dim=1).cpu().numpy()
else:
dist = (
(x ** 2).sum(1, keepdims=True)
- 2 * np.matmul(x, self.C_np)
+ self.Cnorm_np
)
return np.argmin(dist, axis=1)
def get_feat_iterator(feat_dir, split, nshard, rank):
feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
with open(leng_path, "r") as f:
lengs = [int(line.rstrip()) for line in f]
offsets = [0] + np.cumsum(lengs[:-1]).tolist()
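    # lengs[i] is the frame count of utterance i; offsets[i] is its starting
    # row in the concatenated feature matrix written by dump_feature.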
def iterate():
feat = np.load(feat_path, mmap_mode="r")
assert feat.shape[0] == (offsets[-1] + lengs[-1])
for offset, leng in zip(offsets, lengs):
yield feat[offset: offset + leng]
return iterate, len(lengs)
def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir):
apply_kmeans = ApplyKmeans(km_path)
generator, num = get_feat_iterator(feat_dir, split, nshard, rank)
iterator = generator()
lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km"
os.makedirs(lab_dir, exist_ok=True)
with open(lab_path, "w") as f:
for feat in tqdm.tqdm(iterator, total=num):
# feat = torch.from_numpy(feat).cuda()
lab = apply_kmeans(feat).tolist()
f.write(" ".join(map(str, lab)) + "\n")
logger.info("finished successfully")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("feat_dir")
parser.add_argument("split")
parser.add_argument("km_path")
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("lab_dir")
args = parser.parse_args()
    logger.info(str(args))
dump_label(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import torch
import torchaudio
from feature_utils import get_path_iterator, dump_feature
from fairseq.data.audio.audio_utils import get_features_or_waveform
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_mfcc_feature")
class MfccFeatureReader(object):
def __init__(self, sample_rate):
self.sample_rate = sample_rate
def read_audio(self, path, ref_len=None):
wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate)
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_feats(self, path, ref_len=None):
x = self.read_audio(path, ref_len=ref_len)
with torch.no_grad():
x = torch.from_numpy(x).float()
x = x.view(1, -1)
mfccs = torchaudio.compliance.kaldi.mfcc(
waveform=x,
sample_frequency=self.sample_rate,
use_energy=False,
) # (time, freq)
mfccs = mfccs.transpose(0, 1) # (freq, time)
deltas = torchaudio.functional.compute_deltas(mfccs)
ddeltas = torchaudio.functional.compute_deltas(deltas)
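            # stack static, delta, and delta-delta coefficients: 13 x 3 = 39 dims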
concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
            concat = concat.transpose(0, 1).contiguous()  # (time, freq)
return concat
def main(tsv_dir, split, nshard, rank, feat_dir, sample_rate):
reader = MfccFeatureReader(sample_rate)
generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("split")
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("--sample_rate", type=int, default=16000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import fairseq
import soundfile as sf
import torch
import torch.nn.functional as F
from feature_utils import get_path_iterator, dump_feature
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_w2v2_feature")
class Wav2Vec2FeatureReader(object):
def __init__(self, ckpt_path, layer, max_chunk=1600000):
(
model,
cfg,
task,
) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
self.model = model[0].eval().cuda()
self.task = task
self.layer = layer # assume this is 1-based like HuBERT
self.max_chunk = max_chunk
logger.info(f"TASK CONFIG:\n{self.task.cfg}")
logger.info(f" max_chunk = {self.max_chunk}")
logger.info(f" model:\n{self.model}")
def read_audio(self, path, ref_len=None):
wav, sr = sf.read(path)
assert sr == self.task.cfg.sample_rate, sr
if wav.ndim == 2:
wav = wav.mean(-1)
assert wav.ndim == 1, wav.ndim
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_feats(self, path, ref_len=None):
x = self.read_audio(path, ref_len)
with torch.no_grad():
x = torch.from_numpy(x).float().cuda()
if self.task.cfg.normalize:
x = F.layer_norm(x, x.shape)
x = x.view(1, -1)
feat = []
for start in range(0, x.size(1), self.max_chunk):
x_chunk = x[:, start: start + self.max_chunk]
res = self.model.extract_features(
source=x_chunk,
padding_mask=None,
mask=False,
layer=self.layer - 1,
)
feat_chunk = res["x"]
feat.append(feat_chunk)
return torch.cat(feat, 1).squeeze(0)
def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
reader = Wav2Vec2FeatureReader(ckpt_path, layer, max_chunk)
generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("split")
parser.add_argument("ckpt_path")
parser.add_argument("layer", type=int)
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("--max_chunk", type=int, default=1600000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import tqdm
from npy_append_array import NpyAppendArray
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("feature_utils")
def get_shard_range(tot, nshard, rank):
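    # Split [0, tot) into nshard contiguous, near-equal ranges by rounding;
    # each item falls in exactly one shard.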
    assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
start = round(tot / nshard * rank)
end = round(tot / nshard * (rank + 1))
assert start < end, f"start={start}, end={end}"
logger.info(
f"rank {rank} of {nshard}, process {end-start} "
f"({start}-{end}) out of {tot}"
)
return start, end
def get_path_iterator(tsv, nshard, rank):
with open(tsv, "r") as f:
root = f.readline().rstrip()
lines = [line.rstrip() for line in f]
start, end = get_shard_range(len(lines), nshard, rank)
lines = lines[start:end]
def iterate():
for line in lines:
subpath, nsample = line.split("\t")
yield f"{root}/{subpath}", int(nsample)
return iterate, len(lines)
def dump_feature(reader, generator, num, split, nshard, rank, feat_dir):
iterator = generator()
feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
os.makedirs(feat_dir, exist_ok=True)
if os.path.exists(feat_path):
os.remove(feat_path)
feat_f = NpyAppendArray(feat_path)
with open(leng_path, "w") as leng_f:
for path, nsample in tqdm.tqdm(iterator, total=num):
feat = reader.get_feats(path, nsample)
feat_f.append(feat.cpu().numpy())
leng_f.write(f"{len(feat)}\n")
logger.info("finished successfully")
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import joblib
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("learn_kmeans")
def get_km_model(
n_clusters,
init,
max_iter,
batch_size,
tol,
max_no_improvement,
n_init,
reassignment_ratio,
):
return MiniBatchKMeans(
n_clusters=n_clusters,
init=init,
max_iter=max_iter,
batch_size=batch_size,
verbose=1,
compute_labels=False,
tol=tol,
max_no_improvement=max_no_improvement,
init_size=None,
n_init=n_init,
reassignment_ratio=reassignment_ratio,
)
def load_feature_shard(feat_dir, split, nshard, rank, percent):
feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
with open(leng_path, "r") as f:
lengs = [int(line.rstrip()) for line in f]
offsets = [0] + np.cumsum(lengs[:-1]).tolist()
if percent < 0:
return np.load(feat_path, mmap_mode="r")
else:
nsample = int(np.ceil(len(lengs) * percent))
indices = np.random.choice(len(lengs), nsample, replace=False)
feat = np.load(feat_path, mmap_mode="r")
sampled_feat = np.concatenate(
[feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
)
logger.info(
(
f"sampled {nsample} utterances, {len(sampled_feat)} frames "
f"from shard {rank}/{nshard}"
)
)
return sampled_feat
def load_feature(feat_dir, split, nshard, seed, percent):
assert percent <= 1.0
feat = np.concatenate(
[
load_feature_shard(feat_dir, split, nshard, r, percent)
for r in range(nshard)
],
axis=0,
)
logging.info(f"loaded feature with dimension {feat.shape}")
return feat
def learn_kmeans(
feat_dir,
split,
nshard,
km_path,
n_clusters,
seed,
percent,
init,
max_iter,
batch_size,
tol,
n_init,
reassignment_ratio,
max_no_improvement,
):
np.random.seed(seed)
feat = load_feature(feat_dir, split, nshard, seed, percent)
km_model = get_km_model(
n_clusters,
init,
max_iter,
batch_size,
tol,
max_no_improvement,
n_init,
reassignment_ratio,
)
km_model.fit(feat)
joblib.dump(km_model, km_path)
inertia = -km_model.score(feat) / len(feat)
logger.info("total intertia: %.5f", inertia)
logger.info("finished successfully")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("feat_dir", type=str)
parser.add_argument("split", type=str)
parser.add_argument("nshard", type=int)
parser.add_argument("km_path", type=str)
parser.add_argument("n_clusters", type=int)
parser.add_argument("--seed", default=0, type=int)
parser.add_argument(
"--percent", default=-1, type=float, help="sample a subset; -1 for all"
)
parser.add_argument("--init", default="k-means++")
parser.add_argument("--max_iter", default=100, type=int)
parser.add_argument("--batch_size", default=10000, type=int)
parser.add_argument("--tol", default=0.0, type=float)
parser.add_argument("--max_no_improvement", default=100, type=int)
parser.add_argument("--n_init", default=20, type=int)
parser.add_argument("--reassignment_ratio", default=0.0, type=float)
args = parser.parse_args()
    logger.info(str(args))
learn_kmeans(**vars(args))