Commit 12c90639 authored by “change”

init

parent 417b607b
#!/bin/bash
if [ $# -ne 4 ]; then
    echo "usage: $0 TESTSET SRCLANG TGTLANG GEN"
    exit 1
fi

TESTSET=$1
SRCLANG=$2
TGTLANG=$3
GEN=$4

if ! command -v sacremoses &> /dev/null; then
    echo "sacremoses could not be found, please install with: pip install sacremoses"
    exit 1
fi

# sort hypotheses by sample id, detokenize, then score with sacrebleu
grep ^H "$GEN" \
    | sed 's/^H\-//' \
    | sort -n -k 1 \
    | cut -f 3 \
    | sacremoses detokenize \
    > "$GEN.sorted.detok"

sacrebleu --test-set "$TESTSET" --language-pair "${SRCLANG}-${TGTLANG}" < "$GEN.sorted.detok"
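For reference, a possible invocation of this scoring script (saved here under the assumed name score_sacrebleu.sh) on a fairseq-generate output file would be:

    bash score_sacrebleu.sh wmt14 en de gen.out

where gen.out is the file produced by fairseq-generate and the test set name must be one that sacrebleu recognizes.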
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into shards while respecting document boundaries. Documents
should be separated by a single empty line.
"""
import argparse
import contextlib
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("--num-shards", type=int)
    args = parser.parse_args()
    assert args.num_shards is not None and args.num_shards > 1

    with open(args.input, "r", encoding="utf-8") as h:
        with contextlib.ExitStack() as stack:
            outputs = [
                stack.enter_context(
                    open(args.input + ".shard" + str(i), "w", encoding="utf-8")
                )
                for i in range(args.num_shards)
            ]

            doc = []
            first_doc = [True] * args.num_shards

            def output_doc(i):
                if not first_doc[i]:
                    outputs[i].write("\n")
                first_doc[i] = False
                for line in doc:
                    outputs[i].write(line)
                doc.clear()

            num_docs = 0
            for line in h:
                if line.strip() == "":  # empty line indicates new document
                    output_doc(num_docs % args.num_shards)
                    num_docs += 1
                else:
                    doc.append(line)
            output_doc(num_docs % args.num_shards)


if __name__ == "__main__":
    main()
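A minimal usage sketch, assuming the script above is saved as shard_docs.py and corpus.txt holds documents separated by blank lines:

    python shard_docs.py corpus.txt --num-shards 10

This writes corpus.txt.shard0 through corpus.txt.shard9 next to the input, assigning documents round-robin.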
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into a train and valid set while respecting document
boundaries. Documents should be separated by a single empty line.
"""
import argparse
import random
import sys
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("sample_output", help="train output file")
    parser.add_argument("remainder_output", help="valid output file")
    parser.add_argument("-k", type=int, help="remainder size")
    parser.add_argument(
        "--lines", action="store_true", help="split lines instead of docs"
    )
    args = parser.parse_args()
    assert args.k is not None

    sample = []
    remainder = []
    num_docs = [0]

    def update_sample(doc):
        if len(sample) < args.k:
            sample.append(doc.copy())
        else:
            i = num_docs[0]
            j = random.randrange(i + 1)
            if j < args.k:
                remainder.append(sample[j])
                sample[j] = doc.copy()
            else:
                remainder.append(doc.copy())
        num_docs[0] += 1
        doc.clear()

    with open(args.input, "r", encoding="utf-8") as h:
        doc = []
        for i, line in enumerate(h):
            if line.strip() == "":  # empty line indicates new document
                update_sample(doc)
            else:
                doc.append(line)
            if args.lines:
                update_sample(doc)
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
        if len(doc) > 0:
            update_sample(doc)
        print(file=sys.stderr, flush=True)

    assert len(sample) == args.k

    with open(args.sample_output, "w", encoding="utf-8") as out:
        first = True
        for doc in sample:
            if not first and not args.lines:
                out.write("\n")
            first = False
            for line in doc:
                out.write(line)

    with open(args.remainder_output, "w", encoding="utf-8") as out:
        first = True
        for doc in remainder:
            if not first and not args.lines:
                out.write("\n")
            first = False
            for line in doc:
                out.write(line)


if __name__ == "__main__":
    main()
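The update_sample helper is a reservoir-sampling step: the first k documents fill the sample, and each later document replaces a random slot with probability k/(i+1), so sample ends up as a uniform random subset of size k and every displaced or rejected document goes to remainder. A usage sketch with assumed file names:

    python split_train_valid_docs.py corpus.txt corpus.sample corpus.rest -k 1000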
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import sentencepiece as spm
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for decoding"
    )
    parser.add_argument("--input", required=True, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.input_format == "piece":

        def decode(input):
            return "".join(sp.DecodePieces(input))

    elif args.input_format == "id":

        def decode(input):
            return "".join(sp.DecodeIds(input))

    else:
        raise NotImplementedError

    def tok2int(tok):
        # remap reference-side <unk> (represented as <<unk>>) to 0
        return int(tok) if tok != "<<unk>>" else 0

    with open(args.input, "r", encoding="utf-8") as h:
        for line in h:
            if args.input_format == "id":
                print(decode(list(map(tok2int, line.rstrip().split()))))
            elif args.input_format == "piece":
                print(decode(line.rstrip().split()))


if __name__ == "__main__":
    main()
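A usage sketch for the decoding script above, with script and file names assumed for illustration:

    python spm_decode.py --model sentencepiece.model --input hyp.pieces > hyp.txt

With --input_format id the tokens are treated as integer IDs, and reference-side <<unk>> placeholders are mapped to ID 0 before decoding.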
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import contextlib
import sys
import sentencepiece as spm
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for encoding"
    )
    parser.add_argument(
        "--inputs", nargs="+", default=["-"], help="input files to filter/encode"
    )
    parser.add_argument(
        "--outputs", nargs="+", default=["-"], help="path to save encoded outputs"
    )
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument(
        "--min-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with fewer than N tokens",
    )
    parser.add_argument(
        "--max-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with more than N tokens",
    )
    args = parser.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":

        def encode(input):
            return sp.EncodeAsPieces(input)

    elif args.output_format == "id":

        def encode(input):
            return list(map(str, sp.EncodeAsIds(input)))

    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:

        def valid(line):
            return (args.min_len is None or len(line) >= args.min_len) and (
                args.max_len is None or len(line) <= args.max_len
            )

    else:

        def valid(lines):
            return True

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-"
            else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-"
            else sys.stdout
            for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)


if __name__ == "__main__":
    main()
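A usage sketch for the encoding script above (script and corpus names assumed). Parallel files are processed jointly, so a sentence pair is skipped if any side is empty or falls outside the --min-len/--max-len bounds:

    python spm_encode.py --model sentencepiece.model \
        --inputs train.en train.es \
        --outputs train.spm.en train.spm.es \
        --output_format piece --min-len 1 --max-len 510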
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import sentencepiece as spm
if __name__ == "__main__":
    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
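This wrapper simply forwards its command line to SentencePieceTrainer.Train, so standard sentencepiece training flags apply; for example (file names assumed):

    python spm_train.py --input=corpus.txt --model_prefix=spm_unigram_10000 \
        --vocab_size=10000 --model_type=unigram --character_coverage=1.0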
#!/usr/bin/env bash
rm -rf fsdp_dummy
mkdir -p fsdp_dummy
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \
--ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
--cpu-offload --checkpoint-activations \
--task language_modeling --tokens-per-sample 256 --batch-size 8 \
--arch transformer_lm_gpt2_tiny \
--optimizer cpu_adam --adam-betas "(0.9,0.98)" \
--lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
--max-update 5 --log-format json --log-interval 1 \
--save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \
--restore-file x.pt "$@"
# Now we try to load the checkpoint
CUDA_VISIBLE_DEVICES=0,1 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \
--ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
--cpu-offload --checkpoint-activations \
--task language_modeling --tokens-per-sample 256 --batch-size 8 \
--arch transformer_lm_gpt2_tiny \
--optimizer cpu_adam --adam-betas "(0.9,0.98)" \
--lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
--max-update 2 --log-format json --log-interval 1 \
--save-interval-updates 2 --save-dir fsdp_dummy
# ####################################
# Hubert SCT2T ED model #
# ####################################
world_size=$1
update_freq=$2
exp_name=$3
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=1
[ -z $exp_name ] && exp_name=sc2t_base_enes_${world_size}gpu_${update_freq}accum6666
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_enes"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_enes/bin-idx"
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_enes/$exp_name"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
--config-dir $CONFIG_DIR/pretrain \
--config-name sc2t_base_librispeech \
\
+task.store_labels=true \
task.labels='["km"]' \
model.label_rate=50 \
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
task.text_cfg.text_data=$TEXT_DATA_DIR \
+task.text_cfg.data_config=config.yaml \
task.text_cfg.text_maxtokens_ratio=3.0 \
\
+criterion.dec_loss_type="ce" \
\
criterion.text_weight=1.0 \
\
model.use_rel_pos_enc=true \
+model.code_use_rel_pos_enc=true \
+model.pad_with_code=true \
model.text_transformer.no_scale_embedding=true \
model.text_transformer.layernorm_embedding=true \
+model.share_decoder_input_output_embed=true \
\
dataset.train_subset=\"train_all+en.kmu-spm\" \
dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
dataset.num_workers=0 \
dataset.max_tokens=1000000 \
optimization.update_freq=[${update_freq}] \
optimization.max_update=400000 \
\
distributed_training.distributed_world_size=${world_size} \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=${exp_name}
sleep 5m
echo "All finished"
# ####################################
# Hubert SCT2T ED model #
# ####################################
world_size=$1
update_freq=$2
exp_name=$3
[ -z $world_size ] && world_size=24
[ -z $update_freq ] && update_freq=3
[ -z $exp_name ] && exp_name=sc2t_base_esen_${world_size}gpu_${update_freq}accum1
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_esen"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_esen"
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_esen/$exp_name"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
--config-dir $CONFIG_DIR/pretrain \
--config-name sc2t_base_librispeech \
\
+task.store_labels=true \
task.labels='["km"]' \
model.label_rate=50 \
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
task.text_cfg.text_data=$TEXT_DATA_DIR \
+task.text_cfg.data_config=config.yaml \
task.text_cfg.text_maxtokens_ratio=3.0 \
\
+criterion.dec_loss_type="ce" \
\
criterion.text_weight=1.0 \
\
model.use_rel_pos_enc=true \
+model.code_use_rel_pos_enc=true \
+model.pad_with_code=true \
model.text_transformer.no_scale_embedding=true \
model.text_transformer.layernorm_embedding=true \
+model.share_decoder_input_output_embed=true \
\
dataset.train_subset=\"train+en.kmu-spm\" \
dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
dataset.num_workers=0 \
dataset.max_tokens=1000000 \
optimization.update_freq=[${update_freq}] \
optimization.max_update=400000 \
\
distributed_training.distributed_world_size=${world_size} \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=${exp_name}
sleep 5m
echo "All finished"
audio_root: ./
standardize_audio: true
use_audio_input: true
vocab_filename: dict.txt
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
  seed: 1337
checkpoint:
  save_interval: 1
  keep_last_epochs: 5
  keep_best_checkpoints: 5
  best_checkpoint_metric: wer
  restore_file: checkpoint_last.pt
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 1
  distributed_port: -1
  nprocs_per_node: 8
task:
  _name: hubert_pretraining
  data: ???
  fine_tuning: true
  label_dir: ???
  normalize: false  # must be consistent with pre-training
  labels: ["ltr"]
  single_target: true
  add_decoder: false
  pad_audio: false
  random_crop: true
  tokenizer: "none"
  sp_path: None
dataset:
  num_workers: 0
  max_tokens: 1200000
  skip_invalid_size_inputs_valid_test: true
  train_subset: train_100
  valid_subset: dev_other
  required_batch_size_multiple: 1
criterion:
  _name: label_smoothed_cross_entropy
  #zero_infinity: true
optimization:
  max_update: 80000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [1]
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
  weight_decay: 0.0
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: hubert_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.65
  mask_channel_prob: 0.5
  mask_channel_length: 64
  layerdrop: 0.1
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
  add_decoder: false
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
checkpoint:
  save_interval: 1
  keep_last_epochs: 10
  keep_best_checkpoints: 5
  best_checkpoint_metric: wer
  restore_file: checkpoint_last.pt
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 24
  distributed_port: -1
  nprocs_per_node: 8
task:
  _name: hubert_pretraining
  data: ???
  fine_tuning: true
  label_dir: ???
  normalize: true  # must be consistent with pre-training
  labels: ["ltr"]
  single_target: true
  add_decoder: false
  pad_audio: false
  random_crop: true
  tokenizer: "none"
  sp_path: None
dataset:
  num_workers: 0
  max_tokens: 1280000
  skip_invalid_size_inputs_valid_test: true
  valid_subset: dev_other
  required_batch_size_multiple: 1
criterion:
  _name: ctc
  zero_infinity: true
optimization:
  max_update: 200000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [1]
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
  weight_decay: 0.0
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: hubert_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.5
  mask_channel_prob: 0.25
  mask_channel_length: 64
  layerdrop: 0.0
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
  add_decoder: false
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true
task:
  _name: denoising
  data: ???
  mask: 0.15
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true
task:
  _name: joint_sc2t_pretraining
  data: ???
  label_dir: ???
  labels: ???
  label_rate: ${model.label_rate}
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: false  # must be consistent with extractor
  add_decoder: true
  text_cfg:
    seed: ${common.seed}
    text_data: ???
    sample_break_mode: eos
    tokens_per_sample: 1024
    shorten_method: "random_crop"
    text_maxtokens_ratio: 1.0
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
  seed: 1337
checkpoint:
  save_interval: 1000000
  keep_last_epochs: 5
  save_interval_updates: 1000
  keep_interval_updates_pattern: 10000
  keep_interval_updates: 5
  best_checkpoint_metric: accuracy
  maximize_best_checkpoint_metric: true
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 1
  nprocs_per_node: 8
criterion:
  _name: "label_smoothed_cross_entropy"
task:
  _name: "translation_from_jst"
dataset:
  num_workers: 0
  max_tokens: 4096
  skip_invalid_size_inputs_valid_test: true
  validate_after_updates: ${model.freeze_finetune_updates}
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  train_subset: train_clean_100
  valid_subset: dev_clean
  required_batch_size_multiple: 1
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.0
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: hubert_t2c
  w2v_path: ???
  layerdrop: 0.1
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true
task:
  _name: denoising
  data: ???
  mask: 0.15
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
if [ ! -d ${HOME}/azcopy_linux_amd64_10.11.0 ]; then
CURRENT_DIR=`pwd`
cd ${HOME} && wget https://azcopyvnext.azureedge.net/release20210616/azcopy_linux_amd64_10.11.0.tar.gz && tar -zxvf azcopy_linux_amd64_10.11.0.tar.gz && rm -f azcopy_linux_amd64_10.11.0.tar.gz && cd ${CURRENT_DIR}
fi
export PATH=$PATH:${HOME}/azcopy_linux_amd64_10.11.0/:${HOME}/.local/bin
export PYTHONPATH=$PYTHONPATH:/mnt/output/users/v-kunwei/code/fairseq
rank=$1
nshard=$2
split=$3
[ -z "$rank" ] && echo "please specify rank" && exit 1
[ -z $nshard ] && nshard=1
[ -z $split ] && split="train"
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq
ckpt_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3.pt
tsv_dir=/home/v-kunwei
feat_dir=${HOME}/$split
python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} 9 ${nshard} ${rank} ${feat_dir} || exit 1
echo "-------------------------------------------------------------------------------------------"
echo "---------------------------------- done ---------------------------------------------"
echo "-------------------------------------------------------------------------------------------"
km_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin
lab_dir=${HOME}/${split}
python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
# sas="?sv=2020-08-04&st=2022-01-02T04%3A58%3A15Z&se=2022-06-01T04%3A58%3A00Z&sr=c&sp=racwdl&sig=NyZKOHivgesEoZ8yvLsVT6aZMYQZMevLLmXNOTaWyvU%3D"
# blob="https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-ziqzhang/data/stbert/data/librispeech/libri_960/hubert_release_iter2_layer9_kmeans/${split}"
# azcopy copy $feat_dir/${split}_${rank}_${nshard}.len "$blob/$sas"
# azcopy copy $feat_dir/${split}_${rank}_${nshard}.npy "$blob/$sas"
# azcopy copy $lab_dir "$blob/$sas" --recursive
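A usage sketch (script name assumed): dump layer-9 HuBERT features and k-means labels for shard 0 of 4 of the train split:

    bash dump_feature_label.sh 0 4 train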
import sys

import torch


def main():
    for line in sys.stdin:
        line = line.rstrip()
        codes = list(map(int, line.split()))
        merged_codes = torch.unique_consecutive(torch.tensor(codes)).numpy()
        merged_codes = map(str, merged_codes)
        print(" ".join(merged_codes))


if __name__ == "__main__":
    main()
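A usage sketch for the merge script above (file names assumed). It collapses runs of repeated k-means units, e.g. "5 5 5 12 12 7" becomes "5 12 7":

    python merge_duplicate_codes.py < train.km > train.kmu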
[ $# -lt 3 ] && echo "Usage: $0 <input-text> <outdir> <DICT> <suffix>" && exit 0
if [ ! -d ${HOME}/sentencepiece ]; then
CURRENT_DIR=`pwd`
cd ${HOME}
git clone https://github.com/google/sentencepiece.git
cd sentencepiece
mkdir build && cd build
cmake .. && make -j 16
sudo make install
sudo ldconfig -v
cd ${HOME}
cd ${CURRENT_DIR}
fi
input=$1
outdir=$2
DICT=$3
suffix=$4
outname=${input##*/}
outname=${outname%.txt*}
[ -z $input ] && echo "You must specify a source file" && exit 1
[ -z $DICT ] && echo "No dict was specified!" && exit 1
[ -z $outdir ] && outdir=${input%/*}
[ -z $outdir ] && outdir="."
[ ! -d $outdir ] && mkdir -p $outdir
echo "Dict : $DICT"
echo "------------------------------- creating idx/bin--------------------------------------------"
echo "$input --> $outdir/${outname}${suffix}.idx"
fairseq-preprocess \
--only-source \
--trainpref $input \
--destdir $outdir \
--thresholdsrc 0 \
--srcdict ${DICT} \
--workers 40
mv $outdir/train.idx $outdir/${outname}${suffix}.idx
mv $outdir/train.bin $outdir/${outname}${suffix}.bin
echo "----------------------------------- done --------------------------------------------"
[ $# -lt 2 ] && echo "Usage: $0 <input-text> <outdir> <MODEL> <suffix>" && exit 0
if [ ! -d ${HOME}/sentencepiece ]; then
CURRENT_DIR=`pwd`
cd ${HOME}
git clone https://github.com/google/sentencepiece.git
cd sentencepiece
mkdir build && cd build
cmake .. && make -j 16
sudo make install
sudo ldconfig -v
cd ${HOME}
cd ${CURRENT_DIR}
fi
input=$1
outdir=$2
MODEL=$3
suffix=$4
outname=${input##*/}
outname=${outname%.wrd*}
[ -z $input ] && echo "You must specify a source file" && exit 1
[ -z $MODEL ] && MODEL=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/spm_unigram_10000.model && echo "No spm model was specified!, set default to $MODEL"
[ -z $outdir ] && outdir=${input%/*}
[ -z $outdir ] && outdir="."
[ ! -d $outdir ] && mkdir -p $outdir
echo "Output: $outdir/$outname.spm"
echo "------------------------------- tokenize text...--------------------------------------------"
spm_encode --model=$MODEL < ${input} > $outdir/$outname.spm || exit 1
echo "----------------------------------- done --------------------------------------------"