{
    "model_name": "Conformer Transducer Medium",
    "model_type": "Transducer",

    "encoder_params": 
    {
        "arch": "Conformer",
        "num_blocks": 16,
        "dim_model": 256,
        "ff_ratio": 4,
        "num_heads": 4,
        "kernel_size": 31,
        "Pdrop": 0.1,

        "relative_pos_enc": true,
        "max_pos_encoding": 10000,

        "subsampling_module": "Conv2d",
        "subsampling_layers": 2,
        "subsampling_filters": [256, 256],
        "subsampling_kernel_size": 3,
        "subsampling_norm": "batch",
        "subsampling_act": "swish",

        "sample_rate": 16000,
        "win_length_ms": 25,
        "hop_length_ms": 10,
        "n_fft": 512,
        "n_mels": 80,
        "normalize": false,
        "mean": -5.6501,
        "std": 4.2280,

        "spec_augment": true,
        "mF": 2,
        "F": 27,
        "mT": 10,
        "pS": 0.05
    },

    "decoder_params":
    {
        "arch": "RNN",
        "num_layers": 1,
        "dim_model": 640,
        "vocab_size": 1000
    },

    "joint_params":
    {
        "joint_mode": "sum",
        "dim_model": 640,
        "act": "tanh"
    },
    
    "tokenizer_params":
    {
        "tokenizer_path": "datasets/LibriSpeech/LibriSpeech_bpe_1000.model",
        "vocab_type": "bpe",
        "vocab_size": 1000
    },

    "training_params":
    {
        "epochs": 250,
        "batch_size": 8,
        "accumulated_steps": 8,
        "mixed_precision": true,

        "optimizer": "Adam",
        "beta1": 0.9,
        "beta2": 0.98,
        "eps": 1e-9,
        "weight_decay": 1e-6,

        "lr_schedule": "Transformer",
        "schedule_dim": 256,
        "warmup_steps": 10000,
        "K": 2,

        "vn_start_step": 20000,
        "vn_std": 0.075,

        "train_audio_max_length": 256000,
        "train_label_max_length": 90,
        "eval_audio_max_length": null,
        "eval_label_max_length": null,

        "training_dataset": "LibriSpeech",
        "training_dataset_path": "datasets/LibriSpeech/",

        "evaluation_dataset": "LibriSpeech",
        "evaluation_dataset_path": "datasets/LibriSpeech/",

        "callback_path": "callbacks/ConformerTransducerMedium/"
    },

    "decoding_params":
    {
        "beam_size": 16,
        "tmp": 1,

        "ngram_path": "callbacks/ngram/6gram_1000.arpa",
        "ngram_alpha": 0.3,
        "ngram_beta": 1,

        "lm_config": "configs/LM-Transformer.json",
        "lm_weight": 1,
        "lm_tmp": 1
    }
}