Commit 72f5785f authored by huaerkl

v1.0
# @package _global_
task:
label_dir: ???
labels: ["km"]
model:
label_rate: 100
# @package _global_
task:
label_dir: ???
labels: ["km"]
model:
label_rate: 50
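# Note: 100 Hz matches labels extracted at a 10 ms hop (e.g. MFCC-based
# k-means), while 50 Hz matches labels clustered from HuBERT transformer
# features, which the conv extractor downsamples to a 20 ms hop.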
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
seed: 1337
tensorboard_logdir: tblog
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
distributed_training:
ddp_backend: no_c10d
distributed_backend: 'nccl'
distributed_world_size: 32
distributed_port: 29671
nprocs_per_node: 8
find_unused_parameters: true
task:
_name: hubert_pretraining
data: ???
label_dir: ???
labels: ???
label_rate: ${model.label_rate}
sample_rate: 16000
max_sample_size: 250000
min_sample_size: 32000
pad_audio: false
random_crop: true
normalize: false # must be consistent with extractor
dataset:
num_workers: 6
max_tokens: 1400000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
validate_interval_updates: 10000
criterion:
_name: hubert
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: [10,]
optimization:
max_update: 400000
lr: [0.0005]
clip_norm: 10.0
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: hubert
label_rate: ???
skip_masked: false
skip_nomask: false
mask_prob: 0.80
extractor_mode: default
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
final_dim: 256
encoder_layerdrop: 0.05
dropout_input: 0.1
dropout_features: 0.1
dropout: 0.1
attention_dropout: 0.1
feature_grad_mult: 0.1
untie_final_proj: true
activation_dropout: 0.0
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
- task.label_dir
run:
dir: ???
sweep:
dir: ???
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
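# Usage sketch (an assumption based on upstream fairseq's HuBERT recipe; the
# config name presumes this file is saved as hubert_base_librispeech.yaml):
#   fairseq-hydra-train \
#     --config-dir /path/to/config/pretrain \
#     --config-name hubert_base_librispeech \
#     task.data=/path/to/tsv task.label_dir=/path/to/labels \
#     task.labels='["km"]' model.label_rate=100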
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
seed: 1337
tensorboard_logdir: tblog
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
distributed_training:
ddp_backend: no_c10d
distributed_backend: 'nccl'
distributed_world_size: 128
distributed_port: 29671
nprocs_per_node: 8
find_unused_parameters: true
task:
_name: hubert_pretraining
data: ???
label_dir: ???
labels: ???
label_rate: ${model.label_rate}
sample_rate: 16000
max_sample_size: 250000
min_sample_size: 32000
pad_audio: false
random_crop: true
normalize: true # must be consistent with extractor
dataset:
num_workers: 6
max_tokens: 900000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
validate_interval_updates: 10000
criterion:
_name: hubert
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: [10,]
optimization:
max_update: 400000
lr: [0.0015]
clip_norm: 1.0
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: hubert
label_rate: ???
encoder_layers: 24
encoder_embed_dim: 1024
encoder_ffn_embed_dim: 4096
encoder_attention_heads: 16
final_dim: 768
skip_masked: false
skip_nomask: false
mask_prob: 0.80
extractor_mode: layer_norm
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
encoder_layerdrop: 0.0
dropout_input: 0.0
dropout_features: 0.0
dropout: 0.0
attention_dropout: 0.0
layer_norm_first: true
feature_grad_mult: 1.0
untie_final_proj: true
activation_dropout: 0.0
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
run:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
sweep:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
seed: 1337
tensorboard_logdir: tblog
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
distributed_training:
ddp_backend: no_c10d
distributed_backend: 'nccl'
distributed_world_size: 256
distributed_port: 29671
nprocs_per_node: 8
find_unused_parameters: true
task:
_name: hubert_pretraining
data: ???
label_dir: ???
labels: ???
label_rate: ${model.label_rate}
sample_rate: 16000
max_sample_size: 250000
min_sample_size: 32000
pad_audio: false
random_crop: true
normalize: true # must be consistent with extractor
dataset:
num_workers: 6
max_tokens: 360000
skip_invalid_size_inputs_valid_test: true
validate_interval: 5
validate_interval_updates: 10000
criterion:
_name: hubert
pred_masked_weight: 1.0
pred_nomask_weight: 0.0
loss_weights: [10,]
optimization:
max_update: 400000
lr: [0.003]
clip_norm: 1.0
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: hubert
label_rate: ???
encoder_layers: 48
encoder_embed_dim: 1280
encoder_ffn_embed_dim: 5120
encoder_attention_heads: 16
final_dim: 1024
skip_masked: false
skip_nomask: false
mask_prob: 0.80
extractor_mode: layer_norm
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
encoder_layerdrop: 0.0
dropout_input: 0.0
dropout_features: 0.0
dropout: 0.0
attention_dropout: 0.0
layer_norm_first: true
feature_grad_mult: 1.0
untie_final_proj: true
activation_dropout: 0.0
hydra:
job:
config:
override_dirname:
kv_sep: '-'
item_sep: '__'
exclude_keys:
- run
- task.data
run:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
sweep:
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _global_
hydra:
launcher:
cpus_per_task: 8
gpus_per_node: 8
tasks_per_node: ${hydra.launcher.gpus_per_node}
nodes: 4
comment: null
mem_gb: 384
timeout_min: 4320
max_num_timeout: 100
constraint: volta32gb
name: ${hydra.job.config_name}/${hydra.job.override_dirname}
submitit_folder: ${hydra.sweep.dir}/submitit/%j
distributed_training:
distributed_world_size: 32
distributed_port: 29671
nprocs_per_node: 8
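# nodes (4) x gpus_per_node (8) = 32 GPUs, matching distributed_world_size above.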
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import os.path as op
import re
from tabulate import tabulate
from collections import Counter
def comp_purity(p_xy, axis):
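    # Purity along `axis`: for each bin of the non-reduced axis, take the
    # probability mass of its most likely partner label; the sum of these
    # maxima is the aggregate purity, and dividing by the marginal gives
    # the per-bin purity.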
max_p = p_xy.max(axis=axis)
marg_p = p_xy.sum(axis=axis)
indv_pur = max_p / marg_p
aggr_pur = max_p.sum()
return indv_pur, aggr_pur
def comp_entropy(p):
return (-p * np.log(p + 1e-8)).sum()
def comp_norm_mutual_info(p_xy):
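    # Mutual information I(ref; hyp) = sum_{x,y} p(x,y) log(p(x,y) / (p(x) p(y))),
    # returned raw and normalized by each marginal entropy; MI / H(ref) is the
    # phone-normalized MI (PNMI) used to gauge HuBERT teacher label quality.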
p_x = p_xy.sum(axis=1, keepdims=True)
p_y = p_xy.sum(axis=0, keepdims=True)
pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8)
mi = (p_xy * pmi).sum()
h_x = comp_entropy(p_x)
h_y = comp_entropy(p_y)
return mi, mi / h_x, mi / h_y, h_x, h_y
def pad(labs, n):
if n == 0:
return np.array(labs)
return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n])
def comp_avg_seg_dur(labs_list):
n_frms = 0
n_segs = 0
for labs in labs_list:
labs = np.array(labs)
edges = np.zeros(len(labs)).astype(bool)
edges[0] = True
edges[1:] = labs[1:] != labs[:-1]
n_frms += len(edges)
n_segs += edges.astype(int).sum()
return n_frms / n_segs
def comp_joint_prob(uid2refs, uid2hyps):
"""
Args:
pad: padding for spliced-feature derived labels
"""
cnts = Counter()
skipped = []
abs_frmdiff = 0
for uid in uid2refs:
if uid not in uid2hyps:
skipped.append(uid)
continue
refs = uid2refs[uid]
hyps = uid2hyps[uid]
abs_frmdiff += abs(len(refs) - len(hyps))
min_len = min(len(refs), len(hyps))
refs = refs[:min_len]
hyps = hyps[:min_len]
cnts.update(zip(refs, hyps))
tot = sum(cnts.values())
ref_set = sorted({ref for ref, _ in cnts.keys()})
hyp_set = sorted({hyp for _, hyp in cnts.keys()})
ref2pid = dict(zip(ref_set, range(len(ref_set))))
hyp2lid = dict(zip(hyp_set, range(len(hyp_set))))
# print(hyp_set)
p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float)
for (ref, hyp), cnt in cnts.items():
p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt
p_xy /= p_xy.sum()
return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped
def read_phn(tsv_path, rm_stress=True):
uid2phns = {}
with open(tsv_path) as f:
for line in f:
uid, phns = line.rstrip().split("\t")
phns = phns.split(",")
if rm_stress:
phns = [re.sub("[0-9]", "", phn) for phn in phns]
uid2phns[uid] = phns
return uid2phns
def read_lab(tsv_path, lab_path, pad_len=0, upsample=1):
"""
tsv is needed to retrieve the uids for the labels
"""
with open(tsv_path) as f:
f.readline()
uids = [op.splitext(op.basename(line.rstrip().split()[0]))[0] for line in f]
with open(lab_path) as f:
labs_list = [pad(line.rstrip().split(), pad_len).repeat(upsample) for line in f]
assert len(uids) == len(labs_list)
return dict(zip(uids, labs_list))
def main_lab_lab(
tsv_dir,
lab_dir,
lab_name,
lab_sets,
ref_dir,
ref_name,
pad_len=0,
upsample=1,
verbose=False,
):
# assume tsv_dir is the same for both the reference and the hypotheses
tsv_dir = lab_dir if tsv_dir is None else tsv_dir
uid2refs = {}
for s in lab_sets:
uid2refs.update(read_lab(f"{tsv_dir}/{s}.tsv", f"{ref_dir}/{s}.{ref_name}"))
uid2hyps = {}
for s in lab_sets:
uid2hyps.update(
read_lab(
f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
)
)
_main(uid2refs, uid2hyps, verbose)
def main_phn_lab(
tsv_dir,
lab_dir,
lab_name,
lab_sets,
phn_dir,
phn_sets,
pad_len=0,
upsample=1,
verbose=False,
):
uid2refs = {}
for s in phn_sets:
uid2refs.update(read_phn(f"{phn_dir}/{s}.tsv"))
uid2hyps = {}
tsv_dir = lab_dir if tsv_dir is None else tsv_dir
for s in lab_sets:
uid2hyps.update(
read_lab(
f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample
)
)
_main(uid2refs, uid2hyps, verbose)
def _main(uid2refs, uid2hyps, verbose):
(p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped) = comp_joint_prob(
uid2refs, uid2hyps
)
ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0)
hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1)
(mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy)
outputs = {
"ref pur": ref_pur,
"hyp pur": hyp_pur,
"H(ref)": h_ref,
"H(hyp)": h_hyp,
"MI": mi,
"MI/H(ref)": mi_norm_by_ref,
"ref segL": comp_avg_seg_dur(uid2refs.values()),
"hyp segL": comp_avg_seg_dur(uid2hyps.values()),
"p_xy shape": p_xy.shape,
"frm tot": tot,
"frm diff": frmdiff,
"utt tot": len(uid2refs),
"utt miss": len(skipped),
}
print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f"))
if __name__ == "__main__":
"""
    compute the quality of hypothesis labels with respect to frame-level
    phone transcriptions, or to another reference label set if provided
"""
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("lab_dir")
parser.add_argument("lab_name")
parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+")
parser.add_argument(
"--phn_dir",
default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1",
)
parser.add_argument(
"--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+"
)
parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses")
parser.add_argument(
"--upsample", default=1, type=int, help="upsample factor for hypotheses"
)
parser.add_argument("--ref_lab_dir", default="")
parser.add_argument("--ref_lab_name", default="")
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
if args.ref_lab_dir and args.ref_lab_name:
main_lab_lab(
args.tsv_dir,
args.lab_dir,
args.lab_name,
args.lab_sets,
args.ref_lab_dir,
args.ref_lab_name,
args.pad_len,
args.upsample,
args.verbose,
)
else:
main_phn_lab(
args.tsv_dir,
args.lab_dir,
args.lab_name,
args.lab_sets,
args.phn_dir,
args.phn_sets,
args.pad_len,
args.upsample,
args.verbose,
)
# Sharded Feature Extraction and K-means Application
This folder contains scripts for preparing HuBERT labels from tsv files. The
steps are:
1. feature extraction
2. k-means clustering
3. k-means application
## Data preparation
Each `*.tsv` file describes a list of audio files: the first line is the root
directory, and each following line gives the subpath of one audio file plus
its number of samples, separated by a tab (see `get_path_iterator` in
`feature_utils.py`):
```
<root-dir>
<audio-path-1>	<num-samples-1>
<audio-path-2>	<num-samples-2>
...
```
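One way to generate such manifests is fairseq's `wav2vec_manifest.py`; a
minimal sketch (the audio root and `flac` extension below are placeholders):
```sh
python examples/wav2vec/wav2vec_manifest.py /path/to/audio \
  --dest ${tsv_dir} --ext flac --valid-percent 0.01
```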
## Feature extraction
### MFCC feature
Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D
MFCC+delta+ddelta features for the first-iteration HuBERT training, run:
```sh
python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
```
This shards the tsv file into `${nshard}` shards and extracts features for the
`${rank}`-th shard, where rank is an integer in `[0, nshard-1]`. Features are
saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
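Shards are independent, so a whole split can be processed with a simple loop
(or submitted as parallel jobs), e.g.:
```sh
for rank in $(seq 0 $((nshard - 1))); do
  python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
done
```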
### HuBERT feature
To extract features from the `${layer}`-th transformer layer of a trained
HuBERT model saved at `${ckpt_path}`, run:
```sh
python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir}
```
Features are likewise saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
- if you run out of GPU memory, decrease the chunk size with `--max_chunk`, as
  in the example below
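Halving the default chunk size of 1600000 samples, for instance:
```sh
python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir} \
  --max_chunk 800000
```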
## K-means clustering
To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
```sh
python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1
```
This saves the k-means model to `${km_path}`.
- set `--percent -1` to use all data
- more k-means options are listed under the `-h` flag; a quick check of the
  fitted model is sketched below
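A sketch of such a check, loading the saved model with joblib (scikit-learn's
`MiniBatchKMeans` stores centroids in `cluster_centers_`, of shape
`(n_clusters, feature_dim)`):
```sh
python - <<EOF
import joblib
km = joblib.load("${km_path}")
print(km.cluster_centers_.shape)
EOF
```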
## K-means application
To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
```sh
python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
```
This extracts labels for the `${rank}`-th shard out of `${nshard}` shards and
dumps them to `${lab_dir}/${split}_${rank}_${nshard}.km`.
Finally, merge shards for `${split}` by running
```sh
for rank in $(seq 0 $((nshard - 1))); do
cat $lab_dir/${split}_${rank}_${nshard}.km
done > $lab_dir/${split}.km
```
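As a sanity check, the merged file should contain one label line per audio
file, i.e. one line fewer than the tsv (whose first line is the root dir):
```sh
[ $(wc -l < $lab_dir/${split}.km) -eq $(( $(wc -l < ${tsv_dir}/${split}.tsv) - 1 )) ] \
  && echo "OK"
```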
## Create a dummy dict
To create a dummy dictionary, run
```sh
for x in $(seq 0 $((n_clusters - 1))); do
echo "$x 1"
done >> $lab_dir/dict.km.txt
```
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import fairseq
import torch
import torch.nn.functional as F
from feature_utils import get_path_iterator, dump_feature
from fairseq.data.audio.audio_utils import get_features_or_waveform
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_hubert_feature")
class HubertFeatureReader(object):
def __init__(self, ckpt_path, layer, max_chunk=1600000):
(
model,
cfg,
task,
) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
self.model = model[0].eval().cuda()
self.task = task
self.layer = layer
self.max_chunk = max_chunk
logger.info(f"TASK CONFIG:\n{self.task.cfg}")
logger.info(f" max_chunk = {self.max_chunk}")
def read_audio(self, path, ref_len=None):
wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
if wav.ndim == 2:
wav = wav.mean(-1)
assert wav.ndim == 1, wav.ndim
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_feats(self, path, ref_len=None):
x = self.read_audio(path, ref_len=ref_len)
with torch.no_grad():
x = torch.from_numpy(x).float().cuda()
if self.task.cfg.normalize:
x = F.layer_norm(x, x.shape)
x = x.view(1, -1)
feat = []
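            # Encode in fixed-size chunks (default 1600000 samples = 100 s at
            # 16 kHz) so long recordings fit in GPU memory; the chunk outputs
            # are concatenated along the time axis below.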
for start in range(0, x.size(1), self.max_chunk):
x_chunk = x[:, start : start + self.max_chunk]
feat_chunk, _ = self.model.extract_features(
source=x_chunk,
padding_mask=None,
mask=False,
output_layer=self.layer,
)
feat.append(feat_chunk)
return torch.cat(feat, 1).squeeze(0)
def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
reader = HubertFeatureReader(ckpt_path, layer, max_chunk)
generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("split")
parser.add_argument("ckpt_path")
parser.add_argument("layer", type=int)
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("--max_chunk", type=int, default=1600000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import csv
import logging
import os
import os.path as op
import sys
from dump_hubert_feature import HubertFeatureReader
from feature_utils import get_shard_range, dump_feature
from fairseq.data.audio.audio_utils import get_features_or_waveform
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_hubert_feature_s2t")
class HubertFeatureReaderS2T(HubertFeatureReader):
def read_audio(self, path, ref_len=None):
wav = get_features_or_waveform(
path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate
)
if wav.ndim == 2:
wav = wav.mean(-1)
assert wav.ndim == 1, wav.ndim
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_path_iterator(root, tsv, nshard, rank, audio_col_name):
with open(tsv) as f:
reader = csv.DictReader(
f,
delimiter="\t",
quotechar=None,
doublequote=False,
lineterminator="\n",
quoting=csv.QUOTE_NONE,
)
subpaths = [op.join(root, e[audio_col_name]) for e in reader]
start, end = get_shard_range(len(subpaths), nshard, rank)
subpaths = subpaths[start:end]
def iterate():
for subpath in subpaths:
            # note: entries of `subpaths` already include `root` (joined above)
            yield subpath, None
return iterate, len(subpaths)
def main(
root,
tsv_path,
ckpt_path,
layer,
nshard,
rank,
feat_dir,
split,
max_chunk,
audio_col_name,
):
reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk)
generator, num = get_path_iterator(root, tsv_path, nshard, rank, audio_col_name)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("root")
parser.add_argument("tsv_path")
parser.add_argument("ckpt_path")
parser.add_argument("layer", type=int)
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("split")
parser.add_argument("--audio_col_name", type=str, default="audio")
parser.add_argument("--max_chunk", type=int, default=1600000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import numpy as np
import joblib
import torch
import tqdm
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_km_label")
class ApplyKmeans(object):
def __init__(self, km_path):
self.km_model = joblib.load(km_path)
self.C_np = self.km_model.cluster_centers_.transpose()
self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)
self.C = torch.from_numpy(self.C_np)
self.Cnorm = torch.from_numpy(self.Cnorm_np)
if torch.cuda.is_available():
self.C = self.C.cuda()
self.Cnorm = self.Cnorm.cuda()
def __call__(self, x):
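        # Squared Euclidean distance to every centroid via the expansion
        # ||x - c||^2 = ||x||^2 - 2 x·c + ||c||^2, then take the argmin.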
if isinstance(x, torch.Tensor):
dist = (
x.pow(2).sum(1, keepdim=True)
- 2 * torch.matmul(x, self.C)
+ self.Cnorm
)
return dist.argmin(dim=1).cpu().numpy()
else:
dist = (
(x ** 2).sum(1, keepdims=True)
- 2 * np.matmul(x, self.C_np)
+ self.Cnorm_np
)
return np.argmin(dist, axis=1)
def get_feat_iterator(feat_dir, split, nshard, rank):
feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
with open(leng_path, "r") as f:
lengs = [int(line.rstrip()) for line in f]
offsets = [0] + np.cumsum(lengs[:-1]).tolist()
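    # lengs[i] is the frame count of utterance i; offsets[i] is its starting
    # row in the concatenated feature matrix written by dump_feature.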
def iterate():
feat = np.load(feat_path, mmap_mode="r")
assert feat.shape[0] == (offsets[-1] + lengs[-1])
for offset, leng in zip(offsets, lengs):
yield feat[offset: offset + leng]
return iterate, len(lengs)
def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir):
apply_kmeans = ApplyKmeans(km_path)
generator, num = get_feat_iterator(feat_dir, split, nshard, rank)
iterator = generator()
lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km"
os.makedirs(lab_dir, exist_ok=True)
with open(lab_path, "w") as f:
for feat in tqdm.tqdm(iterator, total=num):
# feat = torch.from_numpy(feat).cuda()
lab = apply_kmeans(feat).tolist()
f.write(" ".join(map(str, lab)) + "\n")
logger.info("finished successfully")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("feat_dir")
parser.add_argument("split")
parser.add_argument("km_path")
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("lab_dir")
args = parser.parse_args()
    logger.info(str(args))
dump_label(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import torch
import torchaudio
from feature_utils import get_path_iterator, dump_feature
from fairseq.data.audio.audio_utils import get_features_or_waveform
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_mfcc_feature")
class MfccFeatureReader(object):
def __init__(self, sample_rate):
self.sample_rate = sample_rate
def read_audio(self, path, ref_len=None):
wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate)
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_feats(self, path, ref_len=None):
x = self.read_audio(path, ref_len=ref_len)
with torch.no_grad():
x = torch.from_numpy(x).float()
x = x.view(1, -1)
mfccs = torchaudio.compliance.kaldi.mfcc(
waveform=x,
sample_frequency=self.sample_rate,
use_energy=False,
) # (time, freq)
mfccs = mfccs.transpose(0, 1) # (freq, time)
deltas = torchaudio.functional.compute_deltas(mfccs)
ddeltas = torchaudio.functional.compute_deltas(deltas)
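            # stack static, delta, and delta-delta coefficients: 13 x 3 = 39 dims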
concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
            concat = concat.transpose(0, 1).contiguous()  # (time, freq)
return concat
def main(tsv_dir, split, nshard, rank, feat_dir, sample_rate):
reader = MfccFeatureReader(sample_rate)
generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("split")
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("--sample_rate", type=int, default=16000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import fairseq
import soundfile as sf
import torch
import torch.nn.functional as F
from feature_utils import get_path_iterator, dump_feature
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_w2v2_feature")
class Wav2Vec2FeatureReader(object):
def __init__(self, ckpt_path, layer, max_chunk=1600000):
(
model,
cfg,
task,
) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
self.model = model[0].eval().cuda()
self.task = task
self.layer = layer # assume this is 1-based like HuBERT
self.max_chunk = max_chunk
logger.info(f"TASK CONFIG:\n{self.task.cfg}")
logger.info(f" max_chunk = {self.max_chunk}")
logger.info(f" model:\n{self.model}")
def read_audio(self, path, ref_len=None):
wav, sr = sf.read(path)
assert sr == self.task.cfg.sample_rate, sr
if wav.ndim == 2:
wav = wav.mean(-1)
assert wav.ndim == 1, wav.ndim
if ref_len is not None and abs(ref_len - len(wav)) > 160:
logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
return wav
def get_feats(self, path, ref_len=None):
x = self.read_audio(path, ref_len)
with torch.no_grad():
x = torch.from_numpy(x).float().cuda()
if self.task.cfg.normalize:
x = F.layer_norm(x, x.shape)
x = x.view(1, -1)
feat = []
for start in range(0, x.size(1), self.max_chunk):
x_chunk = x[:, start: start + self.max_chunk]
res = self.model.extract_features(
source=x_chunk,
padding_mask=None,
mask=False,
layer=self.layer - 1,
)
feat_chunk = res["x"]
feat.append(feat_chunk)
return torch.cat(feat, 1).squeeze(0)
def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
reader = Wav2Vec2FeatureReader(ckpt_path, layer, max_chunk)
generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
dump_feature(reader, generator, num, split, nshard, rank, feat_dir)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("tsv_dir")
parser.add_argument("split")
parser.add_argument("ckpt_path")
parser.add_argument("layer", type=int)
parser.add_argument("nshard", type=int)
parser.add_argument("rank", type=int)
parser.add_argument("feat_dir")
parser.add_argument("--max_chunk", type=int, default=1600000)
args = parser.parse_args()
logger.info(args)
main(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import tqdm
from npy_append_array import NpyAppendArray
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("feature_utils")
def get_shard_range(tot, nshard, rank):
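    # Split [0, tot) into nshard contiguous, near-equal ranges by rounding;
    # each item falls in exactly one shard.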
    assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
start = round(tot / nshard * rank)
end = round(tot / nshard * (rank + 1))
assert start < end, f"start={start}, end={end}"
logger.info(
f"rank {rank} of {nshard}, process {end-start} "
f"({start}-{end}) out of {tot}"
)
return start, end
def get_path_iterator(tsv, nshard, rank):
with open(tsv, "r") as f:
root = f.readline().rstrip()
lines = [line.rstrip() for line in f]
start, end = get_shard_range(len(lines), nshard, rank)
lines = lines[start:end]
def iterate():
for line in lines:
subpath, nsample = line.split("\t")
yield f"{root}/{subpath}", int(nsample)
return iterate, len(lines)
def dump_feature(reader, generator, num, split, nshard, rank, feat_dir):
iterator = generator()
feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
os.makedirs(feat_dir, exist_ok=True)
if os.path.exists(feat_path):
os.remove(feat_path)
feat_f = NpyAppendArray(feat_path)
with open(leng_path, "w") as leng_f:
for path, nsample in tqdm.tqdm(iterator, total=num):
feat = reader.get_feats(path, nsample)
feat_f.append(feat.cpu().numpy())
leng_f.write(f"{len(feat)}\n")
logger.info("finished successfully")
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import joblib
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("learn_kmeans")
def get_km_model(
n_clusters,
init,
max_iter,
batch_size,
tol,
max_no_improvement,
n_init,
reassignment_ratio,
):
return MiniBatchKMeans(
n_clusters=n_clusters,
init=init,
max_iter=max_iter,
batch_size=batch_size,
verbose=1,
compute_labels=False,
tol=tol,
max_no_improvement=max_no_improvement,
init_size=None,
n_init=n_init,
reassignment_ratio=reassignment_ratio,
)
def load_feature_shard(feat_dir, split, nshard, rank, percent):
feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
with open(leng_path, "r") as f:
lengs = [int(line.rstrip()) for line in f]
offsets = [0] + np.cumsum(lengs[:-1]).tolist()
if percent < 0:
return np.load(feat_path, mmap_mode="r")
else:
nsample = int(np.ceil(len(lengs) * percent))
indices = np.random.choice(len(lengs), nsample, replace=False)
feat = np.load(feat_path, mmap_mode="r")
sampled_feat = np.concatenate(
[feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
)
logger.info(
(
f"sampled {nsample} utterances, {len(sampled_feat)} frames "
f"from shard {rank}/{nshard}"
)
)
return sampled_feat
def load_feature(feat_dir, split, nshard, seed, percent):
assert percent <= 1.0
feat = np.concatenate(
[
load_feature_shard(feat_dir, split, nshard, r, percent)
for r in range(nshard)
],
axis=0,
)
logging.info(f"loaded feature with dimension {feat.shape}")
return feat
def learn_kmeans(
feat_dir,
split,
nshard,
km_path,
n_clusters,
seed,
percent,
init,
max_iter,
batch_size,
tol,
n_init,
reassignment_ratio,
max_no_improvement,
):
np.random.seed(seed)
feat = load_feature(feat_dir, split, nshard, seed, percent)
km_model = get_km_model(
n_clusters,
init,
max_iter,
batch_size,
tol,
max_no_improvement,
n_init,
reassignment_ratio,
)
km_model.fit(feat)
joblib.dump(km_model, km_path)
inertia = -km_model.score(feat) / len(feat)
logger.info("total intertia: %.5f", inertia)
logger.info("finished successfully")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("feat_dir", type=str)
parser.add_argument("split", type=str)
parser.add_argument("nshard", type=int)
parser.add_argument("km_path", type=str)
parser.add_argument("n_clusters", type=int)
parser.add_argument("--seed", default=0, type=int)
parser.add_argument(
"--percent", default=-1, type=float, help="sample a subset; -1 for all"
)
parser.add_argument("--init", default="k-means++")
parser.add_argument("--max_iter", default=100, type=int)
parser.add_argument("--batch_size", default=10000, type=int)
parser.add_argument("--tol", default=0.0, type=float)
parser.add_argument("--max_no_improvement", default=100, type=int)
parser.add_argument("--n_init", default=20, type=int)
parser.add_argument("--reassignment_ratio", default=0.0, type=float)
args = parser.parse_args()
    logger.info(str(args))
learn_kmeans(**vars(args))