Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
9dcc7a15
Commit
9dcc7a15
authored
Apr 25, 2022
by
flyingdown
Browse files
init v0.10.0
parent
db2b0b79
Pipeline
#254
failed with stages
in 0 seconds
Changes
416
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1266 additions
and
0 deletions
+1266
-0
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_large_lv60k_self_960h.json
...ssets/wav2vec2/fairseq/wav2vec_large_lv60k_self_960h.json
+146
-0
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_small.json
...audio_unittest/assets/wav2vec2/fairseq/wav2vec_small.json
+54
-0
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_small_960h.json
..._unittest/assets/wav2vec2/fairseq/wav2vec_small_960h.json
+146
-0
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_vox_new.json
...dio_unittest/assets/wav2vec2/fairseq/wav2vec_vox_new.json
+54
-0
test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_53_56k.json
...chaudio_unittest/assets/wav2vec2/fairseq/xlsr_53_56k.json
+51
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-base-10k-voxpopuli.json
...ec2/huggingface/facebook/wav2vec2-base-10k-voxpopuli.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-base-960h.json
...ets/wav2vec2/huggingface/facebook/wav2vec2-base-960h.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-base.json
...t/assets/wav2vec2/huggingface/facebook/wav2vec2-base.json
+77
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-960h-lv60-self.json
...2/huggingface/facebook/wav2vec2-large-960h-lv60-self.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-960h-lv60.json
...v2vec2/huggingface/facebook/wav2vec2-large-960h-lv60.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-960h.json
...ts/wav2vec2/huggingface/facebook/wav2vec2-large-960h.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-lv60.json
...ts/wav2vec2/huggingface/facebook/wav2vec2-large-lv60.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-xlsr-53-german.json
...2/huggingface/facebook/wav2vec2-large-xlsr-53-german.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-xlsr-53.json
...wav2vec2/huggingface/facebook/wav2vec2-large-xlsr-53.json
+75
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large.json
.../assets/wav2vec2/huggingface/facebook/wav2vec2-large.json
+68
-0
test/torchaudio_unittest/assets/wav2vec2/huggingface/generate_huggingface_model_config.py
...wav2vec2/huggingface/generate_huggingface_model_config.py
+37
-0
test/torchaudio_unittest/backend/__init__.py
test/torchaudio_unittest/backend/__init__.py
+0
-0
test/torchaudio_unittest/backend/common.py
test/torchaudio_unittest/backend/common.py
+25
-0
test/torchaudio_unittest/backend/soundfile/__init__.py
test/torchaudio_unittest/backend/soundfile/__init__.py
+0
-0
test/torchaudio_unittest/backend/soundfile/common.py
test/torchaudio_unittest/backend/soundfile/common.py
+57
-0
No files found.
Too many changes to show.
To preserve performance only
416 of 416+
files are displayed.
Plain diff
Email patch
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_large_lv60k_self_960h.json
0 → 100644
View file @
9dcc7a15
{
"_name"
:
"wav2vec_ctc"
,
"activation_dropout"
:
0.1
,
"apply_mask"
:
true
,
"attention_dropout"
:
0.0
,
"blank_mode"
:
"add"
,
"blank_weight"
:
0.0
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]"
,
"dropout"
:
0.0
,
"dropout_input"
:
0.0
,
"encoder_embed_dim"
:
768
,
"feature_grad_mult"
:
0.0
,
"final_dropout"
:
0.0
,
"freeze_finetune_updates"
:
10000
,
"layerdrop"
:
0.1
,
"mask_channel_before"
:
false
,
"mask_channel_length"
:
64
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.1
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.1
,
"mask_selection"
:
"static"
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"no_pretrained_weights"
:
false
,
"normalize"
:
true
,
"w2v_args"
:
{
"model"
:
{
"_name"
:
"wav2vec2"
,
"activation_dropout"
:
0.0
,
"activation_fn"
:
"gelu"
,
"attention_dropout"
:
0.1
,
"codebook_negatives"
:
0
,
"conv_bias"
:
true
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2"
,
"conv_pos"
:
128
,
"conv_pos_groups"
:
16
,
"cross_sample_negatives"
:
0
,
"dropout"
:
0.0
,
"dropout_features"
:
0.1
,
"dropout_input"
:
0.1
,
"encoder_attention_heads"
:
16
,
"encoder_embed_dim"
:
1024
,
"encoder_ffn_embed_dim"
:
4096
,
"encoder_layerdrop"
:
0.0
,
"encoder_layers"
:
24
,
"extractor_mode"
:
"layer_norm"
,
"feature_grad_mult"
:
1.0
,
"final_dim"
:
768
,
"latent_dim"
:
0
,
"latent_groups"
:
2
,
"latent_temp"
:
[
2.0
,
0.1
,
0.999995
],
"latent_vars"
:
320
,
"layer_norm_first"
:
true
,
"logit_temp"
:
0.1
,
"mask_channel_before"
:
false
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.65
,
"mask_selection"
:
"static"
,
"negatives_from_everywhere"
:
false
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"num_negatives"
:
100
,
"quantize_input"
:
false
,
"quantize_targets"
:
true
,
"quantizer_depth"
:
1
,
"quantizer_factor"
:
3
,
"same_quantizer"
:
false
,
"target_glu"
:
false
},
"task"
:
{
"_name"
:
"audio_pretraining"
,
"autoregressive"
:
false
,
"binarized_dataset"
:
false
,
"enable_padding"
:
false
,
"eval_wer"
:
false
,
"eval_wer_config"
:
{
"beam"
:
5
,
"constraints"
:
null
,
"decoding_format"
:
null
,
"diverse_beam_groups"
:
-1
,
"diverse_beam_strength"
:
0.5
,
"diversity_rate"
:
-1.0
,
"iter_decode_eos_penalty"
:
0.0
,
"iter_decode_force_max_iter"
:
false
,
"iter_decode_max_iter"
:
10
,
"iter_decode_with_beam"
:
1
,
"iter_decode_with_external_reranker"
:
false
,
"lenpen"
:
1.0
,
"lm_path"
:
null
,
"lm_weight"
:
0.0
,
"match_source_len"
:
false
,
"max_len_a"
:
0.0
,
"max_len_b"
:
200
,
"min_len"
:
1
,
"nbest"
:
1
,
"no_beamable_mm"
:
false
,
"no_early_stop"
:
false
,
"no_repeat_ngram_size"
:
0
,
"no_seed_provided"
:
false
,
"prefix_size"
:
0
,
"print_alignment"
:
null
,
"print_step"
:
false
,
"replace_unk"
:
null
,
"retain_dropout"
:
false
,
"retain_dropout_modules"
:
null
,
"retain_iter_history"
:
false
,
"sacrebleu"
:
false
,
"sampling"
:
false
,
"sampling_topk"
:
-1
,
"sampling_topp"
:
-1.0
,
"score_reference"
:
false
,
"temperature"
:
1.0
,
"unkpen"
:
0.0
,
"unnormalized"
:
false
},
"eval_wer_post_process"
:
"letter"
,
"eval_wer_tokenizer"
:
null
,
"inferred_w2v_config"
:
null
,
"labels"
:
null
,
"max_sample_size"
:
320000
,
"min_sample_size"
:
32000
,
"normalize"
:
true
,
"num_batch_buckets"
:
0
,
"precompute_mask_indices"
:
false
,
"sample_rate"
:
16000
,
"tpu"
:
true
}
},
"w2v_path"
:
"/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt"
}
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_small.json
0 → 100644
View file @
9dcc7a15
{
"_name"
:
"wav2vec2"
,
"activation_dropout"
:
0.0
,
"activation_fn"
:
"gelu"
,
"attention_dropout"
:
0.1
,
"codebook_negatives"
:
0
,
"conv_bias"
:
false
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2"
,
"conv_pos"
:
128
,
"conv_pos_groups"
:
16
,
"cross_sample_negatives"
:
0
,
"dropout"
:
0.1
,
"dropout_features"
:
0.1
,
"dropout_input"
:
0.1
,
"encoder_attention_heads"
:
12
,
"encoder_embed_dim"
:
768
,
"encoder_ffn_embed_dim"
:
3072
,
"encoder_layerdrop"
:
0.05
,
"encoder_layers"
:
12
,
"extractor_mode"
:
"default"
,
"feature_grad_mult"
:
0.1
,
"final_dim"
:
256
,
"latent_dim"
:
0
,
"latent_groups"
:
2
,
"latent_temp"
:
[
2.0
,
0.5
,
0.999995
],
"latent_vars"
:
320
,
"layer_norm_first"
:
false
,
"logit_temp"
:
0.1
,
"mask_channel_before"
:
false
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.65
,
"mask_selection"
:
"static"
,
"negatives_from_everywhere"
:
false
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"num_negatives"
:
100
,
"quantize_input"
:
false
,
"quantize_targets"
:
true
,
"quantizer_depth"
:
1
,
"quantizer_factor"
:
3
,
"same_quantizer"
:
false
,
"target_glu"
:
false
}
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_small_960h.json
0 → 100644
View file @
9dcc7a15
{
"_name"
:
"wav2vec_ctc"
,
"activation_dropout"
:
0.1
,
"apply_mask"
:
true
,
"attention_dropout"
:
0.0
,
"blank_mode"
:
"add"
,
"blank_weight"
:
0.0
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]"
,
"dropout"
:
0.0
,
"dropout_input"
:
0.0
,
"encoder_embed_dim"
:
512
,
"feature_grad_mult"
:
0.0
,
"final_dropout"
:
0.0
,
"freeze_finetune_updates"
:
0
,
"layerdrop"
:
0.1
,
"mask_channel_before"
:
false
,
"mask_channel_length"
:
64
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.1
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.5
,
"mask_selection"
:
"static"
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"no_pretrained_weights"
:
false
,
"normalize"
:
false
,
"w2v_args"
:
{
"model"
:
{
"_name"
:
"wav2vec2"
,
"activation_dropout"
:
0.0
,
"activation_fn"
:
"gelu"
,
"attention_dropout"
:
0.1
,
"codebook_negatives"
:
0
,
"conv_bias"
:
false
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2"
,
"conv_pos"
:
128
,
"conv_pos_groups"
:
16
,
"cross_sample_negatives"
:
0
,
"dropout"
:
0.1
,
"dropout_features"
:
0.1
,
"dropout_input"
:
0.1
,
"encoder_attention_heads"
:
12
,
"encoder_embed_dim"
:
768
,
"encoder_ffn_embed_dim"
:
3072
,
"encoder_layerdrop"
:
0.05
,
"encoder_layers"
:
12
,
"extractor_mode"
:
"default"
,
"feature_grad_mult"
:
0.1
,
"final_dim"
:
256
,
"latent_dim"
:
0
,
"latent_groups"
:
2
,
"latent_temp"
:
[
2
,
0.5
,
0.999995
],
"latent_vars"
:
320
,
"layer_norm_first"
:
false
,
"logit_temp"
:
0.1
,
"mask_channel_before"
:
false
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.65
,
"mask_selection"
:
"static"
,
"negatives_from_everywhere"
:
false
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"num_negatives"
:
100
,
"quantize_input"
:
false
,
"quantize_targets"
:
true
,
"quantizer_depth"
:
1
,
"quantizer_factor"
:
3
,
"same_quantizer"
:
false
,
"target_glu"
:
false
},
"task"
:
{
"_name"
:
"audio_pretraining"
,
"autoregressive"
:
false
,
"binarized_dataset"
:
false
,
"enable_padding"
:
false
,
"eval_wer"
:
false
,
"eval_wer_config"
:
{
"beam"
:
5
,
"constraints"
:
null
,
"decoding_format"
:
null
,
"diverse_beam_groups"
:
-1
,
"diverse_beam_strength"
:
0.5
,
"diversity_rate"
:
-1.0
,
"iter_decode_eos_penalty"
:
0.0
,
"iter_decode_force_max_iter"
:
false
,
"iter_decode_max_iter"
:
10
,
"iter_decode_with_beam"
:
1
,
"iter_decode_with_external_reranker"
:
false
,
"lenpen"
:
1.0
,
"lm_path"
:
null
,
"lm_weight"
:
0.0
,
"match_source_len"
:
false
,
"max_len_a"
:
0.0
,
"max_len_b"
:
200
,
"min_len"
:
1
,
"nbest"
:
1
,
"no_beamable_mm"
:
false
,
"no_early_stop"
:
false
,
"no_repeat_ngram_size"
:
0
,
"no_seed_provided"
:
false
,
"prefix_size"
:
0
,
"print_alignment"
:
null
,
"print_step"
:
false
,
"replace_unk"
:
null
,
"retain_dropout"
:
false
,
"retain_dropout_modules"
:
null
,
"retain_iter_history"
:
false
,
"sacrebleu"
:
false
,
"sampling"
:
false
,
"sampling_topk"
:
-1
,
"sampling_topp"
:
-1.0
,
"score_reference"
:
false
,
"temperature"
:
1.0
,
"unkpen"
:
0.0
,
"unnormalized"
:
false
},
"eval_wer_post_process"
:
"letter"
,
"eval_wer_tokenizer"
:
null
,
"inferred_w2v_config"
:
null
,
"labels"
:
null
,
"max_sample_size"
:
250000
,
"min_sample_size"
:
32000
,
"normalize"
:
false
,
"num_batch_buckets"
:
0
,
"precompute_mask_indices"
:
false
,
"sample_rate"
:
16000
,
"tpu"
:
true
}
},
"w2v_path"
:
"???"
}
test/torchaudio_unittest/assets/wav2vec2/fairseq/wav2vec_vox_new.json
0 → 100644
View file @
9dcc7a15
{
"_name"
:
"wav2vec2"
,
"activation_dropout"
:
0.0
,
"activation_fn"
:
"gelu"
,
"attention_dropout"
:
0.1
,
"codebook_negatives"
:
0
,
"conv_bias"
:
true
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2"
,
"conv_pos"
:
128
,
"conv_pos_groups"
:
16
,
"cross_sample_negatives"
:
0
,
"dropout"
:
0.0
,
"dropout_features"
:
0.1
,
"dropout_input"
:
0.1
,
"encoder_attention_heads"
:
16
,
"encoder_embed_dim"
:
1024
,
"encoder_ffn_embed_dim"
:
4096
,
"encoder_layerdrop"
:
0.0
,
"encoder_layers"
:
24
,
"extractor_mode"
:
"layer_norm"
,
"feature_grad_mult"
:
1.0
,
"final_dim"
:
768
,
"latent_dim"
:
0
,
"latent_groups"
:
2
,
"latent_temp"
:
[
2.0
,
0.1
,
0.999995
],
"latent_vars"
:
320
,
"layer_norm_first"
:
true
,
"logit_temp"
:
0.1
,
"mask_channel_before"
:
false
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.65
,
"mask_selection"
:
"static"
,
"negatives_from_everywhere"
:
false
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"num_negatives"
:
100
,
"quantize_input"
:
false
,
"quantize_targets"
:
true
,
"quantizer_depth"
:
1
,
"quantizer_factor"
:
3
,
"same_quantizer"
:
false
,
"target_glu"
:
false
}
test/torchaudio_unittest/assets/wav2vec2/fairseq/xlsr_53_56k.json
0 → 100644
View file @
9dcc7a15
{
"_name"
:
"wav2vec2"
,
"activation_dropout"
:
0.0
,
"activation_fn"
:
"gelu"
,
"attention_dropout"
:
0.0
,
"codebook_negatives"
:
0
,
"conv_bias"
:
true
,
"conv_feature_layers"
:
"[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2"
,
"conv_pos"
:
128
,
"conv_pos_groups"
:
16
,
"cross_sample_negatives"
:
0
,
"dropout"
:
0.0
,
"dropout_features"
:
0.0
,
"dropout_input"
:
0.0
,
"encoder_attention_heads"
:
16
,
"encoder_embed_dim"
:
1024
,
"encoder_ffn_embed_dim"
:
4096
,
"encoder_layerdrop"
:
0.0
,
"encoder_layers"
:
24
,
"extractor_mode"
:
"layer_norm"
,
"feature_grad_mult"
:
1.0
,
"final_dim"
:
768
,
"latent_dim"
:
0
,
"latent_groups"
:
2
,
"latent_temp"
:
[
2.0
,
0.1
,
0.999995
],
"latent_vars"
:
320
,
"layer_norm_first"
:
true
,
"logit_temp"
:
0.1
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_length"
:
10
,
"mask_min_space"
:
1
,
"mask_other"
:
0.0
,
"mask_prob"
:
0.65
,
"mask_selection"
:
"static"
,
"negatives_from_everywhere"
:
false
,
"no_mask_channel_overlap"
:
false
,
"no_mask_overlap"
:
false
,
"num_negatives"
:
100
,
"quantize_input"
:
false
,
"quantize_targets"
:
true
,
"same_quantizer"
:
false
,
"target_glu"
:
false
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-base-10k-voxpopuli.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2Model"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
false
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
false
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"group"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
768
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
3072
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
12
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
12
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-base-960h.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2ForCTC"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
false
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
false
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"group"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
768
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
3072
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
12
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
12
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-base.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.0
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2Model"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
false
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
false
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_norm"
:
"group"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.0
,
"freeze_feat_extract_train"
:
true
,
"gradient_checkpointing"
:
true
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_size"
:
768
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
3072
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.05
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_min_space"
:
1
,
"mask_time_other"
:
0.0
,
"mask_time_prob"
:
0.05
,
"mask_time_selection"
:
"static"
,
"model_type"
:
"wav2vec2"
,
"no_mask_channel_overlap"
:
false
,
"no_mask_time_overlap"
:
false
,
"num_attention_heads"
:
12
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
12
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-960h-lv60-self.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2ForCTC"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
true
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
true
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"layer"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-960h-lv60.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2ForCTC"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
true
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
true
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"layer"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-960h.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2ForCTC"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
false
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
false
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"group"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-lv60.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2Model"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
true
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
true
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"layer"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-xlsr-53-german.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2ForCTC"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
true
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
true
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"layer"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
36
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large-xlsr-53.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.0
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2Model"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
true
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
true
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"layer"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.0
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_channel_length"
:
10
,
"mask_channel_min_space"
:
1
,
"mask_channel_other"
:
0.0
,
"mask_channel_prob"
:
0.0
,
"mask_channel_selection"
:
"static"
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_min_space"
:
1
,
"mask_time_other"
:
0.0
,
"mask_time_prob"
:
0.075
,
"mask_time_selection"
:
"static"
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/facebook/wav2vec2-large.json
0 → 100644
View file @
9dcc7a15
{
"activation_dropout"
:
0.1
,
"apply_spec_augment"
:
true
,
"architectures"
:
[
"Wav2Vec2Model"
],
"attention_dropout"
:
0.1
,
"bos_token_id"
:
1
,
"conv_bias"
:
false
,
"conv_dim"
:
[
512
,
512
,
512
,
512
,
512
,
512
,
512
],
"conv_kernel"
:
[
10
,
3
,
3
,
3
,
3
,
2
,
2
],
"conv_stride"
:
[
5
,
2
,
2
,
2
,
2
,
2
,
2
],
"ctc_loss_reduction"
:
"sum"
,
"ctc_zero_infinity"
:
false
,
"do_stable_layer_norm"
:
false
,
"eos_token_id"
:
2
,
"feat_extract_activation"
:
"gelu"
,
"feat_extract_dropout"
:
0.0
,
"feat_extract_norm"
:
"group"
,
"feat_proj_dropout"
:
0.1
,
"final_dropout"
:
0.1
,
"gradient_checkpointing"
:
false
,
"hidden_act"
:
"gelu"
,
"hidden_dropout"
:
0.1
,
"hidden_dropout_prob"
:
0.1
,
"hidden_size"
:
1024
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
4096
,
"layer_norm_eps"
:
1e-05
,
"layerdrop"
:
0.1
,
"mask_feature_length"
:
10
,
"mask_feature_prob"
:
0.0
,
"mask_time_length"
:
10
,
"mask_time_prob"
:
0.05
,
"model_type"
:
"wav2vec2"
,
"num_attention_heads"
:
16
,
"num_conv_pos_embedding_groups"
:
16
,
"num_conv_pos_embeddings"
:
128
,
"num_feat_extract_layers"
:
7
,
"num_hidden_layers"
:
24
,
"pad_token_id"
:
0
,
"transformers_version"
:
"4.5.1"
,
"vocab_size"
:
32
}
test/torchaudio_unittest/assets/wav2vec2/huggingface/generate_huggingface_model_config.py
0 → 100644
View file @
9dcc7a15
import
os
import
json
from
transformers
import
Wav2Vec2Model
# Absolute path of the directory that contains this script. _main() joins it
# with each model key to build the path of the JSON file it writes.
_THIS_DIR
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
def _main():
    """Regenerate the HuggingFace wav2vec2 config JSON files next to this script.

    For every model key listed below, the model is loaded with
    ``Wav2Vec2Model.from_pretrained``, its config is round-tripped through
    ``to_json_string`` into a plain dict, the ``_name_or_path`` entry is
    removed, and the result is written as key-sorted, 4-space-indented JSON
    (plus a trailing newline) to ``<_THIS_DIR>/<key>.json``.
    """
    keys = [
        # pretrained
        "facebook/wav2vec2-base",
        "facebook/wav2vec2-large",
        "facebook/wav2vec2-large-lv60",
        "facebook/wav2vec2-base-10k-voxpopuli",
        "facebook/wav2vec2-large-xlsr-53",
        # finetuned
        "facebook/wav2vec2-base-960h",
        "facebook/wav2vec2-large-960h",
        "facebook/wav2vec2-large-960h-lv60",
        "facebook/wav2vec2-large-960h-lv60-self",
        "facebook/wav2vec2-large-xlsr-53-german",
    ]
    for key in keys:
        path = os.path.join(_THIS_DIR, f'{key}.json')
        print('Generating ', path)
        cfg = Wav2Vec2Model.from_pretrained(key).config
        cfg = json.loads(cfg.to_json_string())
        # pop() instead of del: a config that lacks this entry must not abort
        # the whole run with a KeyError.
        cfg.pop('_name_or_path', None)
        # Every key contains a "/" so the target lives in a subdirectory
        # (e.g. "facebook/"); create it so open() works on a fresh checkout.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Explicit encoding keeps the output independent of the locale.
        with open(path, 'w', encoding='utf-8') as file_:
            file_.write(json.dumps(cfg, indent=4, sort_keys=True))
            file_.write('\n')
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported.
if
__name__
==
'__main__'
:
_main
()
test/torchaudio_unittest/backend/__init__.py
0 → 100644
View file @
9dcc7a15
test/torchaudio_unittest/backend/common.py
0 → 100644
View file @
9dcc7a15
from
torchaudio_unittest.common_utils
import
sox_utils
def get_encoding(ext, dtype):
    """Return the encoding label for a file extension / dtype pair.

    The extensions ``'mp3'``, ``'flac'`` and ``'vorbis'`` (compared verbatim)
    map to their own upper-cased name. Every other extension is resolved from
    ``dtype`` through a fixed dtype-name table.

    Args:
        ext: File extension string.
        dtype: Dtype name; only looked up when ``ext`` is not one of the three
            extensions above.

    Returns:
        ``ext.upper()`` for the three special extensions, otherwise one of
        ``'PCM_F'``, ``'PCM_S'`` or ``'PCM_U'``.

    Raises:
        KeyError: If ``ext`` is not special-cased and ``dtype`` is not one of
            ``'float32'``, ``'int32'``, ``'int16'`` or ``'uint8'``.
    """
    if ext in {'mp3', 'flac', 'vorbis'}:
        return ext.upper()

    pcm_label_by_dtype = {
        'float32': 'PCM_F',
        'int32': 'PCM_S',
        'int16': 'PCM_S',
        'uint8': 'PCM_U',
    }
    return pcm_label_by_dtype[dtype]
def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value for a file extension / dtype pair.

    ``'flac'`` yields 24 and ``'mp3'`` / ``'vorbis'`` yield 0. Any other
    extension yields whatever ``sox_utils.get_bit_depth(dtype)`` returns.

    Args:
        ext: File extension string, used as the lookup key.
        dtype: Dtype name handed to ``sox_utils.get_bit_depth``.

    Returns:
        The fixed value for the three extensions above, otherwise the
        dtype-derived bit depth.
    """
    # The dtype-derived depth is computed unconditionally, so anything
    # get_bit_depth raises surfaces regardless of ``ext``.
    depth_from_dtype = sox_utils.get_bit_depth(dtype)
    fixed_depths = {
        'flac': 24,
        'mp3': 0,
        'vorbis': 0,
    }
    return fixed_depths.get(ext, depth_from_dtype)
test/torchaudio_unittest/backend/soundfile/__init__.py
0 → 100644
View file @
9dcc7a15
test/torchaudio_unittest/backend/soundfile/common.py
0 → 100644
View file @
9dcc7a15
import
itertools
from
unittest
import
skipIf
from
parameterized
import
parameterized
from
torchaudio._internal.module_utils
import
is_module_available
def name_func(func, _, params):
    """Compose a name from a function's name and its parameter values.

    Has the three-argument shape that ``parameterize`` passes as
    ``name_func=`` to ``parameterized.expand``; the second argument is
    ignored.

    Args:
        func: Object whose ``__name__`` becomes the prefix.
        _: Unused.
        params: Object whose ``args`` attribute is an iterable of values.

    Returns:
        ``"<func.__name__>_<arg0>_<arg1>..."``, where each value is rendered
        with ``str``.
    """
    prefix = func.__name__
    rendered_args = "_".join(map(str, params.args))
    return f'{prefix}_{rendered_args}'
def dtype2subtype(dtype):
    """Map a dtype name to its subtype name.

    Args:
        dtype: One of ``'float64'``, ``'float32'``, ``'int32'``, ``'int16'``,
            ``'uint8'`` or ``'int8'``.

    Returns:
        The matching subtype string, e.g. ``'PCM_16'`` for ``'int16'``.

    Raises:
        KeyError: If ``dtype`` is not one of the names listed above.
    """
    subtype_by_dtype = dict(
        float64="DOUBLE",
        float32="FLOAT",
        int32="PCM_32",
        int16="PCM_16",
        uint8="PCM_U8",
        int8="PCM_S8",
    )
    return subtype_by_dtype[dtype]
def skipIfFormatNotSupported(fmt):
    """Build a ``unittest.skipIf`` decorator keyed on soundfile format support.

    Args:
        fmt: Format name checked for membership in
            ``soundfile.available_formats()``.

    Returns:
        An unconditional skip decorator when the ``soundfile`` module is not
        available; otherwise a decorator that skips only when ``fmt`` is not
        among the available formats.
    """
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')

    # Imported here, after the availability check, so that defining tests
    # does not require soundfile to be installed.
    import soundfile

    supported = soundfile.available_formats()
    return skipIf(fmt not in supported, f'"{fmt}" is not supported by soundfile')
def parameterize(*params):
    """Expand a test over the Cartesian product of the given iterables.

    Args:
        *params: One iterable of candidate values per test argument.

    Returns:
        The result of ``parameterized.expand`` applied to the materialized
        list of every combination, with ``name_func`` used for naming.
    """
    combinations = list(itertools.product(*params))
    return parameterized.expand(combinations, name_func=name_func)
def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Resolve the wav subtype for a dtype / encoding / bit-depth combination.

    When both ``encoding`` and ``bits_per_sample`` are ``None`` the subtype is
    derived from ``dtype`` via ``dtype2subtype``. Otherwise the subtype is
    looked up from the explicit ``(encoding, bits_per_sample)`` pair and
    ``dtype`` is not consulted.

    Args:
        dtype: Dtype name; only used when neither ``encoding`` nor
            ``bits_per_sample`` is given.
        encoding: ``None`` or one of ``'PCM_U'``, ``'PCM_S'``, ``'PCM_F'``,
            ``'ULAW'``, ``'ALAW'``.
        bits_per_sample: ``None`` or a bit depth valid for ``encoding``.

    Returns:
        The subtype string, e.g. ``'PCM_16'``.

    Raises:
        KeyError: If both ``encoding`` and ``bits_per_sample`` are ``None``
            and ``dtype`` is not known to ``dtype2subtype``.
        ValueError: If the ``(encoding, bits_per_sample)`` pair is not
            supported.
    """
    # Only the "nothing specified" case depends on dtype. Resolving it lazily
    # avoids a spurious KeyError from dtype2subtype when an explicit
    # encoding / bit depth already determines the subtype.
    if encoding is None and bits_per_sample is None:
        return dtype2subtype(dtype)

    explicit_subtypes = {
        (None, 8): "PCM_U8",
        ('PCM_U', None): "PCM_U8",
        ('PCM_U', 8): "PCM_U8",
        ('PCM_S', None): "PCM_32",
        ('PCM_S', 16): "PCM_16",
        ('PCM_S', 32): "PCM_32",
        ('PCM_F', None): "FLOAT",
        ('PCM_F', 32): "FLOAT",
        ('PCM_F', 64): "DOUBLE",
        ('ULAW', None): "ULAW",
        ('ULAW', 8): "ULAW",
        ('ALAW', None): "ALAW",
        ('ALAW', 8): "ALAW",
    }
    subtype = explicit_subtypes.get((encoding, bits_per_sample))
    if subtype:
        return subtype
    raise ValueError(
        f"wav does not support ({encoding}, {bits_per_sample}).")
Prev
1
…
7
8
9
10
11
12
13
14
15
…
21
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment