Commit 9dcc7a15 authored by flyingdown's avatar flyingdown
Browse files

init v0.10.0

parent db2b0b79
Pipeline #254 failed with stages
in 0 seconds
{"effects": [["allpass", "300", "10"]]}
{"effects": [["band", "300", "10"]]}
{"effects": [["bandpass", "300", "10"]]}
{"effects": [["bandreject", "300", "10"]]}
{"effects": [["bass", "-10"]]}
{"effects": [["bend", ".35,180,.25", ".15,740,.53", "0,-520,.3"]]}
{"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]}
{"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]}
{"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]}
{"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]}
{"effects": [["channels", "1"]]}
{"effects": [["channels", "2"]]}
{"effects": [["channels", "3"]]}
{"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]}
{"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]}
{"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]}
{"effects": [["contrast", "0"]]}
{"effects": [["contrast", "25"]]}
{"effects": [["contrast", "50"]]}
{"effects": [["contrast", "75"]]}
{"effects": [["contrast", "100"]]}
{"effects": [["dcshift", "1.0"]]}
{"effects": [["dcshift", "-1.0"]]}
{"effects": [["deemph"]], "input_sample_rate": 44100}
{"effects": [["delay", "1.5", "+1"]]}
{"effects": [["dither", "-s"]]}
{"effects": [["dither", "-S"]]}
{"effects": [["divide"]]}
{"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000}
{"effects": [["earwax"]], "input_sample_rate": 44100}
{"effects": [["echo", "0.8", "0.88", "60", "0.4"]]}
{"effects": [["echo", "0.8", "0.88", "6", "0.4"]]}
{"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]}
{"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]}
{"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]}
{"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]}
{"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]}
{"effects": [["equalizer", "300", "10", "5"]]}
{"effects": [["fade", "q", "3"]]}
{"effects": [["fade", "h", "3"]]}
{"effects": [["fade", "t", "3"]]}
{"effects": [["fade", "l", "3"]]}
{"effects": [["fade", "p", "3"]]}
{"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]}
{"effects": [["fir", "<ASSET_DIR>/sox_effect_test_fir_coeffs.txt"]]}
{"effects": [["flanger"]]}
{"effects": [["gain", "-n"]]}
{"effects": [["gain", "-n", "-3"]]}
{"effects": [["gain", "-l", "-6"]]}
{"effects": [["highpass", "-1", "300"]]}
{"effects": [["highpass", "-2", "300"]]}
{"effects": [["hilbert"]]}
{"effects": [["loudness"]]}
{"effects": [["lowpass", "-1", "300"]]}
{"effects": [["lowpass", "-2", "300"]]}
{"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 44100}
{"effects": [["norm"]]}
{"effects": [["oops"]]}
{"effects": [["overdrive"]]}
{"effects": [["pad"]]}
{"effects": [["phaser"]]}
{"effects": [["pitch", "6.48"], ["rate", "8030"]], "output_sample_rate": 8030}
{"effects": [["pitch", "-6.50"], ["rate", "7970"]], "output_sample_rate": 7970}
{"effects": [["rate", "4567"]], "output_sample_rate": 4567}
{"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8}
{"effects": [["remix", "1-3,7", "3"]], "num_channels": 8}
{"effects": [["repeat"]]}
{"effects": [["reverb"]]}
{"effects": [["reverse"]]}
{"effects": [["riaa"]], "input_sample_rate": 44100}
{"effects": [["silence", "0"]]}
{"effects": [["sinc", "3k"]]}
{"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200}
{"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800}
{"effects": [["stat"]]}
{"effects": [["stats"]]}
{"effects": [["stretch"]]}
{"effects": [["swap"]]}
{"effects": [["synth"]]}
{"effects": [["tempo", "0.9"]]}
{"effects": [["tempo", "1.1"]]}
{"effects": [["treble", "3"]]}
{"effects": [["tremolo", "300", "40"]]}
{"effects": [["tremolo", "300", "50"]]}
{"effects": [["trim", "0", "0.1"]]}
{"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000}
{"effects": [["vad"]]}
{"effects": [["vol", "3"]]}
#!/usr/bin/env python3
"""Generate the conf JSONs from fairseq pretrained weight file, consumed by unit tests
Note:
The current configuration files were generated on fairseq e47a4c84
Usage:
1. Download pretrained parameters from https://github.com/pytorch/fairseq/tree/main/examples/hubert
2. Run this script and save the resulting JSON configuration in assets directory.
Example:
```
python generate_hubert_model_config.py \
--model-file hubert_base_ls960.pt \
> hubert_base_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k.pt \
> hubert_large_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k_finetune_ls960.pt \
> hubert_large_ll60k_finetune_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_xlarge_ll60k.pt \
> hubert_xlarge_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_xlarge_ll60k_finetune_ls960.pt \
> hubert_xlarge_ll60k_finetune_ls960.json
```
"""
import json
import argparse
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'--model-file',
required=True,
help=(
'A pt file from '
'https://github.com/pytorch/fairseq/tree/main/examples/hubert'
)
)
return parser.parse_args()
def _load(model_file):
    """Load a fairseq HuBERT checkpoint.

    Returns the first model of the loaded ensemble together with its
    configuration converted to plain Python containers.
    """
    # Imported lazily so the script can show `--help` without fairseq installed.
    import fairseq
    from omegaconf import OmegaConf

    ensemble, cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [model_file]
    )
    return ensemble[0], OmegaConf.to_container(cfg)
def _main():
    """Print the test-asset JSON configuration for the given checkpoint.

    Local filesystem paths in the config are replaced with placeholder
    values ('/foo/bar') so the generated asset is machine-independent.

    Raises:
        ValueError: if the loaded model is neither a pretrained
            ``HubertModel`` nor a fine-tuned ``HubertCtc``.
    """
    args = _parse_args()
    model, cfg = _load(args.model_file)
    if model.__class__.__name__ == 'HubertModel':
        # Pretrained model: keep the model/task sections and scrub paths.
        cfg['task']['data'] = '/foo/bar'
        cfg['task']['label_dir'] = None
        conf = {
            '_name': 'hubert',
            'model': cfg['model'],
            'task': cfg['task'],
            'num_classes': model.num_classes,
        }
    elif model.__class__.__name__ == 'HubertCtc':
        # Fine-tuned model: the relevant config is nested under 'model',
        # with the pretraining config embedded in 'w2v_args'.
        conf = cfg['model']
        del conf['w2v_path']
        keep = ['_name', 'task', 'model']
        for key in list(k for k in conf['w2v_args'] if k not in keep):
            del conf['w2v_args'][key]
        conf['data'] = '/foo/bar/'
        conf['w2v_args']['task']['data'] = '/foo/bar'
        conf['w2v_args']['task']['labels'] = []
        conf['w2v_args']['task']['label_dir'] = '/foo/bar'
    else:
        # Previously an unknown model type crashed with a confusing
        # NameError on `conf`; fail with an explicit message instead.
        raise ValueError(
            f'Unexpected model type: {model.__class__.__name__}'
        )
    print(json.dumps(conf, indent=4, sort_keys=True))


if __name__ == '__main__':
    _main()
#!/usr/bin/env python3
"""Generate the conf JSON from fairseq pretrained weight file, that is consumed by unit tests
Usage:
1. Download pretrained parameters from https://github.com/pytorch/fairseq/tree/main/examples/wav2vec
2. Download the dict from https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt
and put it in the same directory as parameter files.
3. Run this script and save the resulting JSON configuration in assets directory.
Example:
```
# Pretrained
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small.pt \
> wav2vec_small.json
python generate_wav2vec2_model_config.py \
--model-file libri960_big.pt \
> libri960_big.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_new.pt \
> wav2vec_vox_new.json
# Fine-tuned
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small_960h.pt \
> wav2vec_small_960h.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec_big_960h.pt \
> wav2vec_large_960h.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec2_vox_960h_new.pt \
> wav2vec_large_lv60_960h.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_960h_pl.pt \
> wav2vec_large_lv60_self_960h.json
```
"""
import os
import json
import argparse
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'--model-file',
required=True,
help=(
'A point file from '
'https://github.com/pytorch/fairseq/tree/main/examples/wav2vec'
)
)
parser.add_argument(
'--dict-dir',
help=(
'Directory where `dict.ltr.txt` file is found. '
'Default: the directory of the given model.'
)
)
args = parser.parse_args()
if args.dict_dir is None:
args.dict_dir = os.path.dirname(args.model_file)
return args
def _to_json(conf):
    """Convert an OmegaConf config into plain JSON-compatible containers."""
    # Round-trip through YAML text to strip the OmegaConf node types.
    import yaml
    from omegaconf import OmegaConf

    yaml_text = OmegaConf.to_yaml(conf)
    return yaml.safe_load(yaml_text)
def _load(model_file, dict_dir):
    """Load the checkpoint and return its 'model' section as plain JSON.

    ``dict_dir`` overrides the checkpoint's 'data' setting so fairseq can
    locate `dict.ltr.txt` locally.
    """
    import fairseq

    _, task_args, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [model_file], arg_overrides={'data': dict_dir}
    )
    return _to_json(task_args['model'])
def _main():
    """Emit the model configuration JSON consumed by the unit tests."""
    args = _parse_args()
    conf = _load(args.model_file, args.dict_dir)
    if conf['_name'] == 'wav2vec_ctc':
        # Fine-tuned model: drop local dataset paths and keep only the
        # 'model' and 'task' sub-configs of the pretraining arguments.
        del conf['data']
        w2v_args = conf['w2v_args']
        del w2v_args['task']['data']
        conf['w2v_args'] = {
            'model': w2v_args['model'],
            'task': w2v_args['task'],
        }
    print(json.dumps(conf, indent=4, sort_keys=True))


if __name__ == '__main__':
    _main()
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.1,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.05,
"encoder_layers": 12,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 256,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": false,
"target_glu": false,
"untie_final_proj": false
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"layer6.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": false,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.1,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.0,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.1,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
{
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": false,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.2,
"encoder_layers": 24,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.5,
0.999995
],
"latent_vars": 320,
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
}
{
"_name": "wav2vec_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"blank_mode": "add",
"blank_weight": 0.0,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
"dropout": 0.0,
"dropout_input": 0.0,
"encoder_embed_dim": 512,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.2,
"mask_channel_before": false,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.1,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": false,
"w2v_args": {
"model": {
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": false,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.2,
"encoder_layers": 24,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.5,
0.999995
],
"latent_vars": 320,
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
},
"task": {
"_name": "audio_pretraining",
"autoregressive": false,
"binarized_dataset": false,
"enable_padding": false,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"eval_wer_post_process": "letter",
"eval_wer_tokenizer": null,
"inferred_w2v_config": null,
"labels": null,
"max_sample_size": 320000,
"min_sample_size": 32000,
"normalize": false,
"num_batch_buckets": 0,
"precompute_mask_indices": false,
"sample_rate": 16000,
"tpu": true
}
},
"w2v_path": "???"
}
{
"_name": "wav2vec_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"blank_mode": "add",
"blank_weight": 0.0,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
"dropout": 0.0,
"dropout_input": 0.0,
"encoder_embed_dim": 512,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_before": false,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"model": {
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": true,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.1,
0.999995
],
"latent_vars": 320,
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
},
"task": {
"_name": "audio_pretraining",
"autoregressive": false,
"binarized_dataset": false,
"enable_padding": false,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"eval_wer_post_process": "letter",
"eval_wer_tokenizer": null,
"inferred_w2v_config": null,
"labels": null,
"max_sample_size": 320000,
"min_sample_size": 32000,
"normalize": true,
"num_batch_buckets": 0,
"precompute_mask_indices": false,
"sample_rate": 16000,
"tpu": true
}
},
"w2v_path": "???"
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment