Commit 9dcc7a15 authored by flyingdown's avatar flyingdown
Browse files

init v0.10.0

parent db2b0b79
Pipeline #254 failed with stages
in 0 seconds
{"effects": [["allpass", "300", "10"]]}
{"effects": [["band", "300", "10"]]}
{"effects": [["bandpass", "300", "10"]]}
{"effects": [["bandreject", "300", "10"]]}
{"effects": [["bass", "-10"]]}
{"effects": [["bend", ".35,180,.25", ".15,740,.53", "0,-520,.3"]]}
{"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]}
{"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]}
{"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]}
{"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]}
{"effects": [["channels", "1"]]}
{"effects": [["channels", "2"]]}
{"effects": [["channels", "3"]]}
{"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]}
{"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]}
{"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]}
{"effects": [["contrast", "0"]]}
{"effects": [["contrast", "25"]]}
{"effects": [["contrast", "50"]]}
{"effects": [["contrast", "75"]]}
{"effects": [["contrast", "100"]]}
{"effects": [["dcshift", "1.0"]]}
{"effects": [["dcshift", "-1.0"]]}
{"effects": [["deemph"]], "input_sample_rate": 44100}
{"effects": [["delay", "1.5", "+1"]]}
{"effects": [["dither", "-s"]]}
{"effects": [["dither", "-S"]]}
{"effects": [["divide"]]}
{"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000}
{"effects": [["earwax"]], "input_sample_rate": 44100}
{"effects": [["echo", "0.8", "0.88", "60", "0.4"]]}
{"effects": [["echo", "0.8", "0.88", "6", "0.4"]]}
{"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]}
{"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]}
{"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]}
{"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]}
{"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]}
{"effects": [["equalizer", "300", "10", "5"]]}
{"effects": [["fade", "q", "3"]]}
{"effects": [["fade", "h", "3"]]}
{"effects": [["fade", "t", "3"]]}
{"effects": [["fade", "l", "3"]]}
{"effects": [["fade", "p", "3"]]}
{"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]}
{"effects": [["fir", "<ASSET_DIR>/sox_effect_test_fir_coeffs.txt"]]}
{"effects": [["flanger"]]}
{"effects": [["gain", "-n"]]}
{"effects": [["gain", "-n", "-3"]]}
{"effects": [["gain", "-l", "-6"]]}
{"effects": [["highpass", "-1", "300"]]}
{"effects": [["highpass", "-2", "300"]]}
{"effects": [["hilbert"]]}
{"effects": [["loudness"]]}
{"effects": [["lowpass", "-1", "300"]]}
{"effects": [["lowpass", "-2", "300"]]}
{"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 44100}
{"effects": [["norm"]]}
{"effects": [["oops"]]}
{"effects": [["overdrive"]]}
{"effects": [["pad"]]}
{"effects": [["phaser"]]}
{"effects": [["pitch", "6.48"], ["rate", "8030"]], "output_sample_rate": 8030}
{"effects": [["pitch", "-6.50"], ["rate", "7970"]], "output_sample_rate": 7970}
{"effects": [["rate", "4567"]], "output_sample_rate": 4567}
{"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8}
{"effects": [["remix", "1-3,7", "3"]], "num_channels": 8}
{"effects": [["repeat"]]}
{"effects": [["reverb"]]}
{"effects": [["reverse"]]}
{"effects": [["riaa"]], "input_sample_rate": 44100}
{"effects": [["silence", "0"]]}
{"effects": [["sinc", "3k"]]}
{"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200}
{"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800}
{"effects": [["stat"]]}
{"effects": [["stats"]]}
{"effects": [["stretch"]]}
{"effects": [["swap"]]}
{"effects": [["synth"]]}
{"effects": [["tempo", "0.9"]]}
{"effects": [["tempo", "1.1"]]}
{"effects": [["treble", "3"]]}
{"effects": [["tremolo", "300", "40"]]}
{"effects": [["tremolo", "300", "50"]]}
{"effects": [["trim", "0", "0.1"]]}
{"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000}
{"effects": [["vad"]]}
{"effects": [["vol", "3"]]}
#!/usr/bin/env python3
"""Generate the conf JSONs from fairseq pretrained weight file, consumed by unit tests
Note:
The current configuration files were generated on fairseq e47a4c84
Usage:
1. Download pretrained parameters from https://github.com/pytorch/fairseq/tree/main/examples/hubert
2. Run this script and save the resulting JSON configuration in assets directory.
Example:
```
python generate_hubert_model_config.py \
--model-file hubert_base_ls960.pt \
> hubert_base_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k.pt \
> hubert_large_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_large_ll60k_finetune_ls960.pt \
> hubert_large_ll60k_finetune_ls960.json
python generate_hubert_model_config.py \
--model-file hubert_xlarge_ll60k.pt \
> hubert_xlarge_ll60k.json
python generate_hubert_model_config.py \
--model-file hubert_xlarge_ll60k_finetune_ls960.pt \
> hubert_xlarge_ll60k_finetune_ls960.json
```
"""
import json
import argparse
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'--model-file',
required=True,
help=(
'A pt file from '
'https://github.com/pytorch/fairseq/tree/main/examples/hubert'
)
)
return parser.parse_args()
def _load(model_file):
    """Load a fairseq HuBERT checkpoint.

    Returns the first model of the loaded ensemble together with its
    configuration converted to plain Python containers.
    """
    # Imported lazily so the script can show `--help` without fairseq installed.
    import fairseq
    from omegaconf import OmegaConf

    ensemble, cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [model_file]
    )
    return ensemble[0], OmegaConf.to_container(cfg)
def _main():
    """Print the test-asset JSON configuration for the given checkpoint.

    Local filesystem paths in the config are replaced with placeholder
    values ('/foo/bar') so the generated asset is machine-independent.

    Raises:
        ValueError: if the loaded model is neither a pretrained
            ``HubertModel`` nor a fine-tuned ``HubertCtc``.
    """
    args = _parse_args()
    model, cfg = _load(args.model_file)
    if model.__class__.__name__ == 'HubertModel':
        # Pretrained model: keep the model/task sections and scrub paths.
        cfg['task']['data'] = '/foo/bar'
        cfg['task']['label_dir'] = None
        conf = {
            '_name': 'hubert',
            'model': cfg['model'],
            'task': cfg['task'],
            'num_classes': model.num_classes,
        }
    elif model.__class__.__name__ == 'HubertCtc':
        # Fine-tuned model: the relevant config is nested under 'model',
        # with the pretraining config embedded in 'w2v_args'.
        conf = cfg['model']
        del conf['w2v_path']
        keep = ['_name', 'task', 'model']
        for key in list(k for k in conf['w2v_args'] if k not in keep):
            del conf['w2v_args'][key]
        conf['data'] = '/foo/bar/'
        conf['w2v_args']['task']['data'] = '/foo/bar'
        conf['w2v_args']['task']['labels'] = []
        conf['w2v_args']['task']['label_dir'] = '/foo/bar'
    else:
        # Previously an unknown model type crashed with a confusing
        # NameError on `conf`; fail with an explicit message instead.
        raise ValueError(
            f'Unexpected model type: {model.__class__.__name__}'
        )
    print(json.dumps(conf, indent=4, sort_keys=True))


if __name__ == '__main__':
    _main()
#!/usr/bin/env python3
"""Generate the conf JSON from fairseq pretrained weight file, that is consumed by unit tests
Usage:
1. Download pretrained parameters from https://github.com/pytorch/fairseq/tree/main/examples/wav2vec
2. Download the dict from https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt
and put it in the same directory as parameter files.
3. Run this script and save the resulting JSON configuration in assets directory.
Example:
```
# Pretrained
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small.pt \
> wav2vec_small.json
python generate_wav2vec2_model_config.py \
--model-file libri960_big.pt \
> libri960_big.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_new.pt \
> wav2vec_vox_new.json
# Fine-tuned
python generate_wav2vec2_model_config.py \
--model-file wav2vec_small_960h.pt \
> wav2vec_small_960h.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec_big_960h.pt \
> wav2vec_large_960h.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec2_vox_960h_new.pt \
> wav2vec_large_lv60_960h.json
python generate_wav2vec2_model_config.py \
--model-file wav2vec_vox_960h_pl.pt \
> wav2vec_large_lv60_self_960h.json
```
"""
import os
import json
import argparse
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'--model-file',
required=True,
help=(
'A point file from '
'https://github.com/pytorch/fairseq/tree/main/examples/wav2vec'
)
)
parser.add_argument(
'--dict-dir',
help=(
'Directory where `dict.ltr.txt` file is found. '
'Default: the directory of the given model.'
)
)
args = parser.parse_args()
if args.dict_dir is None:
args.dict_dir = os.path.dirname(args.model_file)
return args
def _to_json(conf):
    """Convert an OmegaConf config into plain JSON-compatible containers."""
    # Round-trip through YAML text to strip the OmegaConf node types.
    import yaml
    from omegaconf import OmegaConf

    yaml_text = OmegaConf.to_yaml(conf)
    return yaml.safe_load(yaml_text)
def _load(model_file, dict_dir):
    """Load the checkpoint and return its 'model' section as plain JSON.

    ``dict_dir`` overrides the checkpoint's 'data' setting so fairseq can
    locate `dict.ltr.txt` locally.
    """
    import fairseq

    _, task_args, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [model_file], arg_overrides={'data': dict_dir}
    )
    return _to_json(task_args['model'])
def _main():
    """Emit the model configuration JSON consumed by the unit tests."""
    args = _parse_args()
    conf = _load(args.model_file, args.dict_dir)
    if conf['_name'] == 'wav2vec_ctc':
        # Fine-tuned model: drop local dataset paths and keep only the
        # 'model' and 'task' sub-configs of the pretraining arguments.
        del conf['data']
        w2v_args = conf['w2v_args']
        del w2v_args['task']['data']
        conf['w2v_args'] = {
            'model': w2v_args['model'],
            'task': w2v_args['task'],
        }
    print(json.dumps(conf, indent=4, sort_keys=True))


if __name__ == '__main__':
    _main()
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.1,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.05,
"encoder_layers": 12,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 256,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": false,
"target_glu": false,
"untie_final_proj": false
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"layer6.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": false,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.1,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 768,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
{
"_name": "hubert",
"model": {
"_name": "hubert",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.0,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.8,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"num_classes": [
504
],
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"label_dir": null,
"label_rate": 50,
"labels": [
"lyr9.km500"
],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
{
"_name": "hubert_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"data": "/foo/bar/",
"dropout": 0.0,
"dropout_input": 0.0,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_length": 64,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"_name": null,
"model": {
"_name": "hubert",
"activation_dropout": 0.1,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1280,
"encoder_ffn_embed_dim": 5120,
"encoder_layerdrop": 0.1,
"encoder_layers": 48,
"extractor_mode": "layer_norm",
"feature_grad_mult": 0.0,
"final_dim": 1024,
"label_rate": 50,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"skip_masked": false,
"skip_nomask": true,
"target_glu": false,
"untie_final_proj": true
},
"task": {
"_name": "hubert_pretraining",
"data": "/foo/bar",
"enable_padding": false,
"fine_tuning": false,
"label_dir": "/foo/bar",
"label_rate": 50,
"labels": [],
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": true,
"pad_audio": false,
"random_crop": true,
"sample_rate": 16000,
"single_target": false
}
}
}
{
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": false,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.2,
"encoder_layers": 24,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.5,
0.999995
],
"latent_vars": 320,
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
}
{
"_name": "wav2vec_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"blank_mode": "add",
"blank_weight": 0.0,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
"dropout": 0.0,
"dropout_input": 0.0,
"encoder_embed_dim": 512,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.2,
"mask_channel_before": false,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.1,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": false,
"w2v_args": {
"model": {
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": false,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.2,
"encoder_layers": 24,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.5,
0.999995
],
"latent_vars": 320,
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
},
"task": {
"_name": "audio_pretraining",
"autoregressive": false,
"binarized_dataset": false,
"enable_padding": false,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"eval_wer_post_process": "letter",
"eval_wer_tokenizer": null,
"inferred_w2v_config": null,
"labels": null,
"max_sample_size": 320000,
"min_sample_size": 32000,
"normalize": false,
"num_batch_buckets": 0,
"precompute_mask_indices": false,
"sample_rate": 16000,
"tpu": true
}
},
"w2v_path": "???"
}
{
"_name": "wav2vec_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"blank_mode": "add",
"blank_weight": 0.0,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
"dropout": 0.0,
"dropout_input": 0.0,
"encoder_embed_dim": 512,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_before": false,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.25,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"model": {
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": true,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.1,
0.999995
],
"latent_vars": 320,
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
},
"task": {
"_name": "audio_pretraining",
"autoregressive": false,
"binarized_dataset": false,
"enable_padding": false,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"eval_wer_post_process": "letter",
"eval_wer_tokenizer": null,
"inferred_w2v_config": null,
"labels": null,
"max_sample_size": 320000,
"min_sample_size": 32000,
"normalize": true,
"num_batch_buckets": 0,
"precompute_mask_indices": false,
"sample_rate": 16000,
"tpu": true
}
},
"w2v_path": "???"
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment