"cacheflow/server/llm_server.py" did not exist on "7297fa6f7c4ce6413ee005025b312c4c9d4f5f0b"
Commit 9dcc7a15 authored by flyingdown
Browse files

init v0.10.0

parent db2b0b79
Pipeline #254 failed with stages
in 0 seconds
{
"_name": "wav2vec_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"blank_mode": "add",
"blank_weight": 0.0,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
"dropout": 0.0,
"dropout_input": 0.0,
"encoder_embed_dim": 768,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 10000,
"layerdrop": 0.1,
"mask_channel_before": false,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.1,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.1,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": true,
"w2v_args": {
"model": {
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": true,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.1,
0.999995
],
"latent_vars": 320,
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
},
"task": {
"_name": "audio_pretraining",
"autoregressive": false,
"binarized_dataset": false,
"enable_padding": false,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"eval_wer_post_process": "letter",
"eval_wer_tokenizer": null,
"inferred_w2v_config": null,
"labels": null,
"max_sample_size": 320000,
"min_sample_size": 32000,
"normalize": true,
"num_batch_buckets": 0,
"precompute_mask_indices": false,
"sample_rate": 16000,
"tpu": true
}
},
"w2v_path": "/private/home/abaevski/models/wav2vec2/wav2vec_vox_new.pt"
}
{
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": false,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.1,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.05,
"encoder_layers": 12,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 256,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.5,
0.999995
],
"latent_vars": 320,
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
}
{
"_name": "wav2vec_ctc",
"activation_dropout": 0.1,
"apply_mask": true,
"attention_dropout": 0.0,
"blank_mode": "add",
"blank_weight": 0.0,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
"dropout": 0.0,
"dropout_input": 0.0,
"encoder_embed_dim": 512,
"feature_grad_mult": 0.0,
"final_dropout": 0.0,
"freeze_finetune_updates": 0,
"layerdrop": 0.1,
"mask_channel_before": false,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.1,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_selection": "static",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"normalize": false,
"w2v_args": {
"model": {
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": false,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.1,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 12,
"encoder_embed_dim": 768,
"encoder_ffn_embed_dim": 3072,
"encoder_layerdrop": 0.05,
"encoder_layers": 12,
"extractor_mode": "default",
"feature_grad_mult": 0.1,
"final_dim": 256,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2,
0.5,
0.999995
],
"latent_vars": 320,
"layer_norm_first": false,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
},
"task": {
"_name": "audio_pretraining",
"autoregressive": false,
"binarized_dataset": false,
"enable_padding": false,
"eval_wer": false,
"eval_wer_config": {
"beam": 5,
"constraints": null,
"decoding_format": null,
"diverse_beam_groups": -1,
"diverse_beam_strength": 0.5,
"diversity_rate": -1.0,
"iter_decode_eos_penalty": 0.0,
"iter_decode_force_max_iter": false,
"iter_decode_max_iter": 10,
"iter_decode_with_beam": 1,
"iter_decode_with_external_reranker": false,
"lenpen": 1.0,
"lm_path": null,
"lm_weight": 0.0,
"match_source_len": false,
"max_len_a": 0.0,
"max_len_b": 200,
"min_len": 1,
"nbest": 1,
"no_beamable_mm": false,
"no_early_stop": false,
"no_repeat_ngram_size": 0,
"no_seed_provided": false,
"prefix_size": 0,
"print_alignment": null,
"print_step": false,
"replace_unk": null,
"retain_dropout": false,
"retain_dropout_modules": null,
"retain_iter_history": false,
"sacrebleu": false,
"sampling": false,
"sampling_topk": -1,
"sampling_topp": -1.0,
"score_reference": false,
"temperature": 1.0,
"unkpen": 0.0,
"unnormalized": false
},
"eval_wer_post_process": "letter",
"eval_wer_tokenizer": null,
"inferred_w2v_config": null,
"labels": null,
"max_sample_size": 250000,
"min_sample_size": 32000,
"normalize": false,
"num_batch_buckets": 0,
"precompute_mask_indices": false,
"sample_rate": 16000,
"tpu": true
}
},
"w2v_path": "???"
}
{
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.1,
"codebook_negatives": 0,
"conv_bias": true,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.1,
"dropout_input": 0.1,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.1,
0.999995
],
"latent_vars": 320,
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_before": false,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"quantizer_depth": 1,
"quantizer_factor": 3,
"same_quantizer": false,
"target_glu": false
}
{
"_name": "wav2vec2",
"activation_dropout": 0.0,
"activation_fn": "gelu",
"attention_dropout": 0.0,
"codebook_negatives": 0,
"conv_bias": true,
"conv_feature_layers": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"cross_sample_negatives": 0,
"dropout": 0.0,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "layer_norm",
"feature_grad_mult": 1.0,
"final_dim": 768,
"latent_dim": 0,
"latent_groups": 2,
"latent_temp": [
2.0,
0.1,
0.999995
],
"latent_vars": 320,
"layer_norm_first": true,
"logit_temp": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.65,
"mask_selection": "static",
"negatives_from_everywhere": false,
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"num_negatives": 100,
"quantize_input": false,
"quantize_targets": true,
"same_quantizer": false,
"target_glu": false
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2Model"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": false,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "group",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 12,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 12,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": false,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "group",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 12,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 12,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.0,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2Model"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": false,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_norm": "group",
"feat_proj_dropout": 0.1,
"final_dropout": 0.0,
"freeze_feat_extract_train": true,
"gradient_checkpointing": true,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-05,
"layerdrop": 0.05,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_space": 1,
"mask_time_other": 0.0,
"mask_time_prob": 0.05,
"mask_time_selection": "static",
"model_type": "wav2vec2",
"no_mask_channel_overlap": false,
"no_mask_time_overlap": false,
"num_attention_heads": 12,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 12,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": false,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "group",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2Model"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2ForCTC"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 36
}
{
"activation_dropout": 0.0,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2Model"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": true,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": true,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "layer",
"feat_proj_dropout": 0.1,
"final_dropout": 0.0,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_channel_length": 10,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.0,
"mask_channel_selection": "static",
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_min_space": 1,
"mask_time_other": 0.0,
"mask_time_prob": 0.075,
"mask_time_selection": "static",
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
{
"activation_dropout": 0.1,
"apply_spec_augment": true,
"architectures": [
"Wav2Vec2Model"
],
"attention_dropout": 0.1,
"bos_token_id": 1,
"conv_bias": false,
"conv_dim": [
512,
512,
512,
512,
512,
512,
512
],
"conv_kernel": [
10,
3,
3,
3,
3,
2,
2
],
"conv_stride": [
5,
2,
2,
2,
2,
2,
2
],
"ctc_loss_reduction": "sum",
"ctc_zero_infinity": false,
"do_stable_layer_norm": false,
"eos_token_id": 2,
"feat_extract_activation": "gelu",
"feat_extract_dropout": 0.0,
"feat_extract_norm": "group",
"feat_proj_dropout": 0.1,
"final_dropout": 0.1,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"mask_feature_length": 10,
"mask_feature_prob": 0.0,
"mask_time_length": 10,
"mask_time_prob": 0.05,
"model_type": "wav2vec2",
"num_attention_heads": 16,
"num_conv_pos_embedding_groups": 16,
"num_conv_pos_embeddings": 128,
"num_feat_extract_layers": 7,
"num_hidden_layers": 24,
"pad_token_id": 0,
"transformers_version": "4.5.1",
"vocab_size": 32
}
import os
import json
from transformers import Wav2Vec2Model
# Absolute path of the directory containing this script; the generated
# config files are written relative to it.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
def _main():
    """Regenerate the JSON config snapshot for each listed checkpoint.

    For every model id in ``keys`` the model is loaded with
    ``Wav2Vec2Model.from_pretrained``, its config is round-tripped through
    JSON, the ``_name_or_path`` entry is dropped, and the result is written
    as key-sorted, 4-space-indented JSON (plus a trailing newline) to
    ``<_THIS_DIR>/<key>.json``.
    """
    keys = [
        # pretrained
        "facebook/wav2vec2-base",
        "facebook/wav2vec2-large",
        "facebook/wav2vec2-large-lv60",
        "facebook/wav2vec2-base-10k-voxpopuli",
        "facebook/wav2vec2-large-xlsr-53",
        # finetuned
        "facebook/wav2vec2-base-960h",
        "facebook/wav2vec2-large-960h",
        "facebook/wav2vec2-large-960h-lv60",
        "facebook/wav2vec2-large-960h-lv60-self",
        "facebook/wav2vec2-large-xlsr-53-german",
    ]
    for key in keys:
        path = os.path.join(_THIS_DIR, f'{key}.json')
        print('Generating ', path)
        cfg = Wav2Vec2Model.from_pretrained(key).config
        cfg = json.loads(cfg.to_json_string())
        # Drop the entry from the snapshot; ``pop`` with a default keeps the
        # script working if a config happens not to carry it.
        cfg.pop('_name_or_path', None)

        # Every key contains a "/" (e.g. "facebook/..."), so the target lives
        # in a subdirectory of _THIS_DIR that may not exist yet.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as file_:
            file_.write(json.dumps(cfg, indent=4, sort_keys=True))
            file_.write('\n')
# Only regenerate the config files when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    _main()
from torchaudio_unittest.common_utils import sox_utils
def get_encoding(ext, dtype):
    """Return the encoding label expected for a file extension / dtype pair.

    For ``'mp3'``, ``'flac'`` and ``'vorbis'`` the label is the upper-cased
    extension and ``dtype`` is not consulted.  For any other extension the
    label is looked up from ``dtype`` (``'float32'`` -> ``'PCM_F'``,
    ``'int32'``/``'int16'`` -> ``'PCM_S'``, ``'uint8'`` -> ``'PCM_U'``).

    Raises:
        KeyError: if the extension is not one of the three above and
            ``dtype`` is not one of the four listed dtypes.
    """
    named_after_extension = {'mp3', 'flac', 'vorbis'}
    if ext in named_after_extension:
        return ext.upper()

    dtype_to_encoding = {
        'float32': 'PCM_F',
        'int32': 'PCM_S',
        'int16': 'PCM_S',
        'uint8': 'PCM_U',
    }
    return dtype_to_encoding[dtype]
def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for an extension / dtype pair.

    ``'flac'`` maps to 24 and ``'mp3'`` / ``'vorbis'`` map to 0.  Any other
    extension yields whatever ``sox_utils.get_bit_depth(dtype)`` returns.
    """
    fixed_bits = {
        'flac': 24,
        'mp3': 0,
        'vorbis': 0,
    }
    # The dtype-derived value is computed unconditionally (even for the
    # fixed-value extensions), so whatever get_bit_depth does for a dtype it
    # rejects happens regardless of ``ext``.
    dtype_bits = sox_utils.get_bit_depth(dtype)
    return fixed_bits.get(ext, dtype_bits)
import itertools
from unittest import skipIf
from parameterized import parameterized
from torchaudio._internal.module_utils import is_module_available
def name_func(func, _, params):
    """Build a test name from a function and its parameter set.

    The result is ``func.__name__``, an underscore, and the ``str()`` of each
    item in ``params.args`` joined by underscores.  The second positional
    argument is ignored.
    """
    suffix = "_".join(map(str, params.args))
    return f'{func.__name__}_{suffix}'
def dtype2subtype(dtype):
    """Map a dtype name to its subtype string.

    Raises:
        KeyError: if ``dtype`` is not one of the six names in the table.
    """
    subtype_by_dtype = {
        "float64": "DOUBLE",
        "float32": "FLOAT",
        "int32": "PCM_32",
        "int16": "PCM_16",
        "uint8": "PCM_U8",
        "int8": "PCM_S8",
    }
    return subtype_by_dtype[dtype]
def skipIfFormatNotSupported(fmt):
    """Return a ``skipIf`` decorator for tests that need soundfile format ``fmt``.

    When the ``soundfile`` module is unavailable the test is skipped
    unconditionally; otherwise it is skipped only if ``fmt`` is absent from
    ``soundfile.available_formats()``.
    """
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')

    # Imported lazily so this module can be loaded without soundfile.
    import soundfile

    supported = soundfile.available_formats()
    return skipIf(fmt not in supported, f'"{fmt}" is not supported by soundfile')
def parameterize(*params):
    """Expand a test over the Cartesian product of the given iterables.

    Each positional argument is an iterable of candidate values; the product
    of all of them is handed to ``parameterized.expand`` with ``name_func``
    supplying the generated test names.
    """
    combinations = list(itertools.product(*params))
    return parameterized.expand(combinations, name_func=name_func)
def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Resolve the wav subtype for an ``(encoding, bits_per_sample)`` pair.

    When both ``encoding`` and ``bits_per_sample`` are ``None`` the subtype is
    derived from ``dtype`` via ``dtype2subtype``; otherwise it comes from a
    fixed table of supported combinations.

    Raises:
        ValueError: if the ``(encoding, bits_per_sample)`` pair is not in the
            table.
    """
    # Resolved up front for every call, so dtype2subtype runs (and may raise
    # for a dtype it does not know) even when encoding/bits are given.
    dtype_default = dtype2subtype(dtype)
    supported = {
        (None, None): dtype_default,
        (None, 8): "PCM_U8",
        ('PCM_U', None): "PCM_U8",
        ('PCM_U', 8): "PCM_U8",
        ('PCM_S', None): "PCM_32",
        ('PCM_S', 16): "PCM_16",
        ('PCM_S', 32): "PCM_32",
        ('PCM_F', None): "FLOAT",
        ('PCM_F', 32): "FLOAT",
        ('PCM_F', 64): "DOUBLE",
        ('ULAW', None): "ULAW",
        ('ULAW', 8): "ULAW",
        ('ALAW', None): "ALAW",
        ('ALAW', 8): "ALAW",
    }
    resolved = supported.get((encoding, bits_per_sample))
    if not resolved:
        raise ValueError(
            f"wav does not support ({encoding}, {bits_per_sample}).")
    return resolved
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment