init

12c90639 · “change” · 417b607b · 12c90639 · 12c90639 · 12c90639
Commit 12c90639 authored Sep 28, 2024 by “change”
20 changed files
--- a/Speech2S/speech2s/data/load_langpair_dataset.py
+++ b/Speech2S/speech2s/data/load_langpair_dataset.py
--- a/Speech2S/speech2s/data/multimodal_corpus_dataset.py
+++ b/Speech2S/speech2s/data/multimodal_corpus_dataset.py
--- a/Speech2S/speech2s/models/__init__.py
+++ b/Speech2S/speech2s/models/__init__.py
--- a/Speech2S/speech2s/models/speechut.py
+++ b/Speech2S/speech2s/models/speechut.py
--- a/Speech2S/speech2s/models/speechut_asr.py
+++ b/Speech2S/speech2s/models/speechut_asr.py
--- a/Speech2S/speech2s/models/speechut_st.py
+++ b/Speech2S/speech2s/models/speechut_st.py
--- a/Speech2S/speech2s/models/t5_transformer_lm.py
+++ b/Speech2S/speech2s/models/t5_transformer_lm.py
+# --------------------------------------------------------
+# Pre-Training Transformer Decoder for End-to-End ASR Model with Unpaired Speech Data (https://arxiv.org/abs/2203.17113)
+# Github source: https://github.com/microsoft/SpeechT5/tree/main/Speech2C
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Based on fairseq code bases
+# https://github.com/pytorch/fairseq
+# --------------------------------------------------------
+from fairseq.models import (
+    register_model_architecture,
+)
+from fairseq.models.transformer_lm import base_lm_architecture
+@register_model_architecture(model_name="transformer_lm", arch_name="transformer_lm_t5")
+def transformer_lm_t5(args):
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6144)
+    args.decoder_layers = getattr(args, "decoder_layers", 20)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.activation_fn = getattr(args, "activation_fn", "gelu")
+    base_lm_architecture(args)
--- a/Speech2S/speech2s/modules/__init__.py
+++ b/Speech2S/speech2s/modules/__init__.py
--- a/Speech2S/speech2s/modules/ctc_prefix_score.py
+++ b/Speech2S/speech2s/modules/ctc_prefix_score.py
--- a/Speech2S/speech2s/modules/learned_positional_embedding.py
+++ b/Speech2S/speech2s/modules/learned_positional_embedding.py
--- a/Speech2S/speech2s/modules/multihead_attention.py
+++ b/Speech2S/speech2s/modules/multihead_attention.py
--- a/Speech2S/speech2s/modules/relative_pos_enc.py
+++ b/Speech2S/speech2s/modules/relative_pos_enc.py
--- a/Speech2S/speech2s/modules/transformer_decoder.py
+++ b/Speech2S/speech2s/modules/transformer_decoder.py
--- a/Speech2S/speech2s/modules/transformer_encoder.py
+++ b/Speech2S/speech2s/modules/transformer_encoder.py
--- a/Speech2S/speech2s/modules/transformer_layer.py
+++ b/Speech2S/speech2s/modules/transformer_layer.py
--- a/Speech2S/speech2s/modules/w2v_encoder.py
+++ b/Speech2S/speech2s/modules/w2v_encoder.py
--- a/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_asr.sh
+++ b/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_asr.sh
--- a/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st.sh
+++ b/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st.sh
--- a/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st_enfr.sh
+++ b/Speech2S/speech2s/scripts copy/pretrain_speechut/base_speechut_for_st_enfr.sh
--- a/Speech2S/speech2s/scripts copy/pretrain_speechut/large_speechut_for_asr.sh
+++ b/Speech2S/speech2s/scripts copy/pretrain_speechut/large_speechut_for_asr.sh