"vscode:/vscode.git/clone" did not exist on "f23f8a0688557e3ca3cf8bbf8e7669eab9912434"
Commit 12c90639 authored by “change”'s avatar “change”
Browse files

init

parent 417b607b
# ####################################
# SpeechUT Large model: ASR fine-tuning (960h)  #
# ####################################
[ $# -lt 3 ] && echo "Usage: $0 <model_path> <data_dir> <cpt_tag> [mount=${PWD}] [world_size=8] [update_freq=3]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
w2v_path=$1
DATA_DIR=$2
cpt=$3
mount=$4
world_size=$5
update_freq=$6
[ -z $mount ] && mount=${PWD}
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=3
CODE_ROOT=${PWD}
exp_name=${w2v_path%/*}
exp_name=${exp_name##*/}
MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
--config-dir $CODE_ROOT/speechut/config/finetune_asr \
--config-name speechut_large_960h \
common.user_dir=$CODE_ROOT/speechut \
\
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
model.w2v_path=${w2v_path} \
\
optimization.lr=[0.00001] \
optimization.max_update=80000 \
dataset.max_tokens=1100000 \
optimization.update_freq=[${update_freq}] \
distributed_training.distributed_world_size=${world_size} \
\
dataset.train_subset="train_960" \
dataset.valid_subset="dev_other" \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5
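# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/speechut_large_pretrained.pt /path/to/asr/manifest_dir <cpt_tag>
# Optional trailing args override mount (default: current directory), world_size (default: 8) and update_freq (default: 3).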
# ####################################
# SpeechUT Base model: ASR fine-tuning (100h)  #
# ####################################
[ $# -lt 3 ] && echo "Usage: $0 <model_path> <data_dir> <cpt_tag> [mount=${PWD}] [world_size=8] [update_freq=2]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
w2v_path=$1
DATA_DIR=$2
cpt=$3
mount=$4
world_size=$5
update_freq=$6
[ -z $mount ] && mount=${PWD}
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=2
CODE_ROOT=${PWD}
exp_name=${w2v_path%/*}
exp_name=${exp_name##*/}
MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/edctc40k_from_${cpt}_bz2.6m_lr1e-5"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
--config-dir $CODE_ROOT/speechut/config/finetune_asr \
--config-name speechut_base_100h \
common.user_dir=$CODE_ROOT/speechut \
\
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
model.w2v_path=${w2v_path} \
\
optimization.lr=[0.00001] \
optimization.max_update=40000 \
dataset.max_tokens=1300000 \
optimization.update_freq=[${update_freq}] \
distributed_training.distributed_world_size=${world_size} \
\
dataset.train_subset="train_clean_100" \
dataset.valid_subset="dev_other" \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=edctc40k_from_${cpt}_bz2.6m_lr1e-5
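# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/speechut_base_pretrained.pt /path/to/asr/manifest_dir <cpt_tag>
# Optional trailing args override mount (default: current directory), world_size (default: 8) and update_freq (default: 2).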
#####################################
# SpeechUT ASR model: joint CTC/attention decoding  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
extra=$6
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=10
[ -z $ctc_weight ] && ctc_weight=0.2
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 since CTC decoding is not used..." && beam_size=1
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 2000000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
&
done
wait
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}
echo $results_path
tail -n 1 $results_path/generate-*.txt
done
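# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_asr_model.pt /path/to/asr/manifest_dir dev_clean,dev_other
# Defaults: gen_set=dev_other, beam_size=10, ctc_weight=0.2; append --normalize when decoding a Large model.
# Each subset is decoded in the background, and the WER is read from the last line of its generate-*.txt.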
#####################################
# SpeechUT ASR model: decoding with external LM  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=30] [ctc_weight=0.3] [lm_weight=0.7] [lm_path] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
lm_weight=$6
lm_path=$7
extra=$8
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=30
[ -z $ctc_weight ] && ctc_weight=0.3
[ -z $lm_weight ] && lm_weight=0.7
[ -z $lm_path ] && lm_path="/mnt/default/v-junyiao/librispeech/lm/lm_ctc_form/checkpoint_best.pt"
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 and lm_weight to 0 since CTC decoding is not used..." && beam_size=1 && lm_weight=0
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 800000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--lm-weight ${lm_weight} --lm-path ${lm_path} \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path ${results_path} \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
&
done
wait
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}
echo $results_path
tail -n 1 $results_path/generate-*.txt
done
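# Example invocation (illustrative only; the script filename and all paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_asr_model.pt /path/to/asr/manifest_dir dev_other 30 0.3 0.7 /path/to/lm/checkpoint_best.pt
# Defaults: beam_size=30, ctc_weight=0.3, lm_weight=0.7; append --normalize when decoding a Large model.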
#####################################
# SpeechUT ASR model: sharded decoding with external LM  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=30] [ctc_weight=0.3] [lm_weight=0.7] [lm_path] [nj=8] [ngpu=8] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
lm_weight=$6
lm_path=$7
nj=$8
ngpu=$9
extra=${10}
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=30
[ -z $ctc_weight ] && ctc_weight=0.3
[ -z $lm_weight ] && lm_weight=0.7
[ -z $lm_path ] && lm_path="/mnt/default/v-junyiao/librispeech/lm/lm_ctc_form/checkpoint_best.pt"
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 and lm_weight to 0 since CTC decoding is not used..." && beam_size=1 && lm_weight=0
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
[ -z $nj ] && nj=8
[ -z $ngpu ] && ngpu=8
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
world_size=$nj
for rank in $(seq 0 $((nj - 1))); do
export CUDA_VISIBLE_DEVICES=$((rank % $ngpu))
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}_${world_size}_${rank}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 800000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--lm-weight ${lm_weight} --lm-path ${lm_path} \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
--distributed-world-size ${world_size} --distributed-rank ${rank} \
&
done
done
wait
for subset in ${gen_set//,/ }; do
results_dir=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}
cat $results_dir/${subset}_${world_size}_*/generate-${subset}.txt | grep -v "^Generate" > $results_dir/generate-${subset}.all.txt
done
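# Example invocation (illustrative only; the script filename and all paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_asr_model.pt /path/to/asr/manifest_dir test_other 30 0.3 0.7 /path/to/lm.pt 8 8
# nj shards run in parallel (one rank per job, assigned round-robin over ngpu GPUs); the per-shard
# generate-<subset>.txt files are then concatenated into generate-<subset>.all.txt for scoring.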
#####################################
# SpeechUT ASR model: sharded decoding without LM  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [nj=32] [ngpu=8] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
nj=$6
ngpu=$7
extra=$8
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=10
[ -z $ctc_weight ] && ctc_weight=0.2
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 since CTC decoding is not used..." && beam_size=1
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
[ -z $nj ] && nj=32
[ -z $ngpu ] && ngpu=8
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
world_size=$nj
for rank in $(seq 0 $((nj - 1))); do
export CUDA_VISIBLE_DEVICES=$((rank % $ngpu))
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 2000000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
--distributed-world-size ${world_size} --distributed-rank ${rank} \
&
done
done
wait
for subset in ${gen_set//,/ }; do
results_dir=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}
cat $results_dir/${subset}_${world_size}_*/generate-${subset}.txt | grep -v "^Generate" > $results_dir/generate-${subset}.all.txt
done
# ####################################
# SpeechUT Base model: ST fine-tuning (MuST-C)  #
# ####################################
[ $# -lt 4 ] && echo "Usage: $0 <model_path> <data_dir> <lang> <cpt-tag> [mount=${PWD}] [world_size=8] [update_freq=4/6]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
w2v_path=$1
DATA_DIR=$2
lang=$3
cpt=$4
mount=$5
world_size=$6
update_freq=$7
[ -z $mount ] && mount=${PWD}
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=4
CODE_ROOT=${PWD}
exp_name=${w2v_path%/*}
exp_name=${exp_name##*/}
MODEL_DIR="$mount/exp/finetune_mustc/$exp_name/legacy_en${lang}_from_${cpt}_bz3.2m_lr3e-5"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
max_tokens=800000
python $CODE_ROOT/fairseq/fairseq_cli/train.py ${DATA_DIR} \
--save-dir ${MODEL_DIR} \
--user-dir $CODE_ROOT/speechut \
--task speech_to_text \
--config-yaml config_en${lang}.yaml \
--train-subset "train_st" \
--valid-subset "dev_st" \
--fp16 \
--seed 1 \
\
--ddp-backend no_c10d \
--distributed-world-size ${world_size} \
--tensorboard-logdir ${MODEL_DIR} \
\
--criterion label_smoothed_cross_entropy --report-accuracy \
--label-smoothing 0.3 \
\
--optimizer adam \
--clip-norm 1.0 \
--lr 3e-05 \
--lr-scheduler polynomial_decay --warmup-updates 5000 \
--max-update 50000 \
--total-num-update 50000 \
--update-freq ${update_freq} \
\
--max-tokens ${max_tokens} \
--max-sentences 16 \
--max-tokens-valid ${max_tokens} \
--grouped-shuffling \
--max-source-positions ${max_tokens} \
--skip-invalid-size-inputs-valid-test \
--num-workers 0 \
--best-checkpoint-metric "accuracy" \
--maximize-best-checkpoint-metric \
\
--arch "speechut_st_legacy" \
--w2v-path ${w2v_path} \
--layerdrop 0.1 \
--activation-dropout 0.1 \
--attention-dropout 0.1 \
--feature-grad-mult 1.0 \
\
--apply-mask --mask-prob 0.5 \
\
--log-format json \
--log-interval 100 \
--save-interval 1 \
--keep-last-epochs 5 \
--keep-best-checkpoints 5 \
\
2>&1 | tee ${MODEL_DIR}/train_en${lang}.log
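# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/speechut_base_st_pretrained.pt /path/to/mustc/en-de de <cpt_tag>
# Expects config_en<lang>.yaml plus train_st/dev_st manifests under <data_dir> (standard fairseq
# speech_to_text layout); training output is also teed to ${MODEL_DIR}/train_en<lang>.log.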
# ####################################
# SpeechUT Base model: ST decoding  #
# ####################################
[ $# -lt 3 ] && echo "Usage: $0 <model_path> <data_dir> <lang> [gen-set=dev] [beam_size=10] [lenpen=1.0]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
lang=$3
gen_set=$4
beam_size=$5
lenpen=$6
[ -z $gen_set ] && gen_set="dev"
[ -z $beam_size ] && beam_size=10
[ -z $lenpen ] && lenpen=1
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--gen-subset ${gen_set}_st \
--max-tokens 2000000 \
--max-source-positions 2000000 \
--num-workers 0 \
\
--user-dir $CODE_ROOT/speechut \
--task speech_to_text \
--config-yaml config_en${lang}.yaml \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring sacrebleu --max-len-a 0 --max-len-b 512 \
--beam ${beam_size} \
--lenpen $lenpen \
# --model-overrides "{'model':{'w2v_path':'/path/to/your/pretrained/model.pt'}}" \
echo $results_path
tail -n 1 $results_path/generate-*.txt
sleep 1s
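# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_st_model.pt /path/to/mustc/en-de de dev 10 1.0
# BLEU is reported via --scoring sacrebleu; uncomment --model-overrides above if the w2v_path stored
# in the checkpoint no longer exists on this machine.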
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import collections
import os
import re
import torch
from fairseq.file_io import PathManager
def average_checkpoints(inputs):
"""Loads checkpoints from inputs and returns a model with averaged weights.
Args:
inputs: An iterable of string paths of checkpoints to load from.
Returns:
A dict of string keys mapping to various values. The 'model' key
from the returned dict should correspond to an OrderedDict mapping
string parameter names to torch Tensors.
"""
params_dict = collections.OrderedDict()
params_keys = None
new_state = None
num_models = len(inputs)
for fpath in inputs:
with PathManager.open(fpath, "rb") as f:
state = torch.load(
f,
map_location=(
lambda s, _: torch.serialization.default_restore_location(s, "cpu")
),
)
# Copies over the settings from the first checkpoint
if new_state is None:
new_state = state
model_params = state["model"]
model_params_keys = list(model_params.keys())
if params_keys is None:
params_keys = model_params_keys
elif params_keys != model_params_keys:
raise KeyError(
"For checkpoint {}, expected list of params: {}, "
"but found: {}".format(f, params_keys, model_params_keys)
)
for k in params_keys:
p = model_params[k]
if isinstance(p, torch.HalfTensor):
p = p.float()
if k not in params_dict:
params_dict[k] = p.clone()
# NOTE: clone() is needed in case of p is a shared parameter
else:
params_dict[k] += p
averaged_params = collections.OrderedDict()
for k, v in params_dict.items():
averaged_params[k] = v
if averaged_params[k].is_floating_point():
averaged_params[k].div_(num_models)
else:
averaged_params[k] //= num_models
new_state["model"] = averaged_params
return new_state
def last_n_checkpoints(paths, n, update_based, upper_bound=None):
assert len(paths) == 1
path = paths[0]
if update_based:
pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
else:
pt_regexp = re.compile(r"checkpoint(\d+)\.pt")
files = PathManager.ls(path)
entries = []
for f in files:
m = pt_regexp.fullmatch(f)
if m is not None:
sort_key = int(m.group(1))
if upper_bound is None or sort_key <= upper_bound:
entries.append((sort_key, m.group(0)))
if len(entries) < n:
raise Exception(
"Found {} checkpoint files but need at least {}", len(entries), n
)
return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
def main():
parser = argparse.ArgumentParser(
description="Tool to average the params of input checkpoints to "
"produce a new checkpoint",
)
# fmt: off
parser.add_argument('--inputs', required=True, nargs='+',
help='Input checkpoint file paths.')
parser.add_argument('--output', required=True, metavar='FILE',
help='Write the new checkpoint containing the averaged weights to this path.')
num_group = parser.add_mutually_exclusive_group()
num_group.add_argument('--num-epoch-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
'path specified by input, and average last this many of them.')
num_group.add_argument('--num-update-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
' input, and average last this many of them.')
parser.add_argument('--checkpoint-upper-bound', type=int,
help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
' averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
' be averaged assuming --save-interval-updates 500'
)
# fmt: on
args = parser.parse_args()
print(args)
num = None
is_update_based = False
if args.num_update_checkpoints is not None:
num = args.num_update_checkpoints
is_update_based = True
elif args.num_epoch_checkpoints is not None:
num = args.num_epoch_checkpoints
assert args.checkpoint_upper_bound is None or (
args.num_epoch_checkpoints is not None
or args.num_update_checkpoints is not None
), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints"
assert (
args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"
if num is not None:
args.inputs = last_n_checkpoints(
args.inputs,
num,
is_update_based,
upper_bound=args.checkpoint_upper_bound,
)
print("averaging checkpoints: ", args.inputs)
new_state = average_checkpoints(args.inputs)
with PathManager.open(args.output, "wb") as f:
torch.save(new_state, f)
print("Finished writing averaged checkpoint to {}".format(args.output))
if __name__ == "__main__":
main()
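# Example usage (illustrative; the script path and checkpoint paths are placeholders):
#   python average_checkpoints.py --inputs /path/to/ckpt_dir --num-epoch-checkpoints 5 --output /path/to/avg_5.pt
#   python average_checkpoints.py --inputs ckpt_a.pt ckpt_b.pt ckpt_c.pt --output avg.pt
# With --num-epoch-checkpoints/--num-update-checkpoints, --inputs must be a single directory that is
# scanned for checkpoint files; otherwise every path listed in --inputs is averaged directly.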
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Use this script in order to build symmetric alignments for your translation
dataset.
This script depends on fast_align and mosesdecoder tools. You will need to
build those before running the script.
fast_align:
github: http://github.com/clab/fast_align
instructions: follow the instructions in README.md
mosesdecoder:
github: http://github.com/moses-smt/mosesdecoder
instructions: http://www.statmt.org/moses/?n=Development.GetStarted
The script produces the following files under --output_dir:
text.joined - concatenation of lines from the source_file and the
target_file.
align.forward - forward pass of fast_align.
align.backward - backward pass of fast_align.
aligned.sym_heuristic - symmetrized alignment.
"""
import argparse
import os
from itertools import zip_longest
def main():
parser = argparse.ArgumentParser(description="symmetric alignment builer")
# fmt: off
parser.add_argument('--fast_align_dir',
help='path to fast_align build directory')
parser.add_argument('--mosesdecoder_dir',
help='path to mosesdecoder root directory')
parser.add_argument('--sym_heuristic',
help='heuristic to use for symmetrization',
default='grow-diag-final-and')
parser.add_argument('--source_file',
help='path to a file with sentences '
'in the source language')
parser.add_argument('--target_file',
help='path to a file with sentences '
'in the target language')
parser.add_argument('--output_dir',
help='output directory')
# fmt: on
args = parser.parse_args()
fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
sym_fast_align_bin = os.path.join(
args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
)
# create joined file
joined_file = os.path.join(args.output_dir, "text.joined")
with open(args.source_file, "r", encoding="utf-8") as src, open(
args.target_file, "r", encoding="utf-8"
) as tgt:
with open(joined_file, "w", encoding="utf-8") as joined:
for s, t in zip_longest(src, tgt):
print("{} ||| {}".format(s.strip(), t.strip()), file=joined)
bwd_align_file = os.path.join(args.output_dir, "align.backward")
# run forward alignment
fwd_align_file = os.path.join(args.output_dir, "align.forward")
fwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v > {FWD}".format(
FASTALIGN=fast_align_bin, JOINED=joined_file, FWD=fwd_align_file
)
assert os.system(fwd_fast_align_cmd) == 0
# run backward alignment
bwd_align_file = os.path.join(args.output_dir, "align.backward")
bwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}".format(
FASTALIGN=fast_align_bin, JOINED=joined_file, BWD=bwd_align_file
)
assert os.system(bwd_fast_align_cmd) == 0
# run symmetrization
sym_out_file = os.path.join(args.output_dir, "aligned")
sym_cmd = "{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}".format(
SYMFASTALIGN=sym_fast_align_bin,
FWD=fwd_align_file,
BWD=bwd_align_file,
SRC=args.source_file,
TGT=args.target_file,
OUT=sym_out_file,
HEURISTIC=args.sym_heuristic,
SYMAL=symal_bin,
)
assert os.system(sym_cmd) == 0
if __name__ == "__main__":
main()
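# Example usage (illustrative; the script path and data paths are placeholders):
#   python build_sym_alignment.py \
#       --fast_align_dir ~/fast_align/build --mosesdecoder_dir ~/mosesdecoder \
#       --source_file train.de --target_file train.en --output_dir alignments/
# Produces text.joined, align.forward, align.backward and the symmetrized alignment under --output_dir.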
#!/usr/bin/env python
"""Helper script to compare two argparse.Namespace objects."""
from argparse import Namespace # noqa
def main():
ns1 = eval(input("Namespace 1: "))
ns2 = eval(input("Namespace 2: "))
def keys(ns):
ks = set()
for k in dir(ns):
if not k.startswith("_"):
ks.add(k)
return ks
k1 = keys(ns1)
k2 = keys(ns2)
def print_keys(ks, ns1, ns2=None):
for k in ks:
if ns2 is None:
print("{}\t{}".format(k, getattr(ns1, k, None)))
else:
print(
"{}\t{}\t{}".format(k, getattr(ns1, k, None), getattr(ns2, k, None))
)
print("Keys unique to namespace 1:")
print_keys(k1 - k2, ns1)
print()
print("Keys unique to namespace 2:")
print_keys(k2 - k1, ns2)
print()
print("Overlapping keys with different values:")
ks = [k for k in k1 & k2 if getattr(ns1, k, "None") != getattr(ns2, k, "None")]
print_keys(ks, ns1, ns2)
print()
if __name__ == "__main__":
main()
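# Example usage (illustrative): run the script and paste the repr() of each Namespace at the prompts, e.g.
#   Namespace 1: Namespace(lr=0.001, max_update=80000)
#   Namespace 2: Namespace(lr=0.0005, max_update=80000)
# The inputs are eval()'d, so only paste namespaces from a trusted source.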
#!/bin/bash
if [ $# -ne 1 ]; then
echo "usage: $0 GENERATE_PY_OUTPUT"
exit 1
fi
GEN=$1
SYS=$GEN.sys
REF=$GEN.ref
if [ $(tail -n 1 $GEN | grep BLEU | wc -l) -ne 1 ]; then
echo "not done generating"
exit
fi
grep ^H $GEN | awk -F '\t' '{print $NF}' | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $SYS
grep ^T $GEN | cut -f2- | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $REF
fairseq-score --sys $SYS --ref $REF
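# Example usage (illustrative; the script filename is a placeholder):
#   bash compound_split_bleu.sh /path/to/generate-test.txt
# Rewrites hyphenated compounds as "##AT##-##AT##" in both hypothesis (H-) and reference (T-) lines
# before scoring with fairseq-score.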
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Extracts random constraints from reference files."""
import argparse
import random
import sys
def get_phrase(words, index, length):
assert index < len(words) - length + 1
phr = " ".join(words[index : index + length])
for i in range(index, index + length):
words.pop(index)
return phr
def main(args):
if args.seed:
random.seed(args.seed)
for line in sys.stdin:
constraints = []
def add_constraint(constraint):
constraints.append(constraint)
source = line.rstrip()
if "\t" in line:
source, target = line.split("\t")
if args.add_sos:
target = f"<s> {target}"
if args.add_eos:
target = f"{target} </s>"
if len(target.split()) >= args.len:
words = [target]
num = args.number
choices = {}
for i in range(num):
if len(words) == 0:
break
segmentno = random.choice(range(len(words)))
segment = words.pop(segmentno)
tokens = segment.split()
phrase_index = random.choice(range(len(tokens)))
choice = " ".join(
tokens[phrase_index : min(len(tokens), phrase_index + args.len)]
)
for j in range(
phrase_index, min(len(tokens), phrase_index + args.len)
):
tokens.pop(phrase_index)
if phrase_index > 0:
words.append(" ".join(tokens[0:phrase_index]))
if phrase_index + 1 < len(tokens):
words.append(" ".join(tokens[phrase_index:]))
choices[target.find(choice)] = choice
# mask out with spaces
target = target.replace(choice, " " * len(choice), 1)
for key in sorted(choices.keys()):
add_constraint(choices[key])
print(source, *constraints, sep="\t")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--number", "-n", type=int, default=1, help="number of phrases")
parser.add_argument("--len", "-l", type=int, default=1, help="phrase length")
parser.add_argument(
"--add-sos", default=False, action="store_true", help="add <s> token"
)
parser.add_argument(
"--add-eos", default=False, action="store_true", help="add </s> token"
)
parser.add_argument("--seed", "-s", default=0, type=int)
args = parser.parse_args()
main(args)
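# Example usage (illustrative; file names are placeholders): feed tab-separated "source<TAB>target"
# pairs on stdin; sampled target phrases are appended as extra tab-separated constraint columns:
#   paste test.src test.tgt | python extract.py --number 2 --len 3 > test.constrained.tsv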
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
"""Reads in a fairseq output file, and verifies that the constraints
(C- lines) are present in the output (the first H- line). Assumes that
constraints are listed prior to the first hypothesis.
"""
constraints = []
found = 0
total = 0
for line in sys.stdin:
if line.startswith("C-"):
constraints.append(line.rstrip().split("\t")[1])
elif line.startswith("H-"):
text = line.split("\t")[2]
for constraint in constraints:
total += 1
if constraint in text:
found += 1
else:
print(f"No {constraint} in {text}", file=sys.stderr)
constraints = []
print(f"Found {found} / {total} = {100 * found / total:.1f}%")
-- Copyright (c) Facebook, Inc. and its affiliates.
--
-- This source code is licensed under the MIT license found in the
-- LICENSE file in the root directory of this source tree.
--
-- Usage: convert_dictionary.lua <dict.th7>
require 'fairseq'
require 'torch'
require 'paths'
if #arg < 1 then
print('usage: convert_dictionary.lua <dict.th7>')
os.exit(1)
end
if not paths.filep(arg[1]) then
print('error: file does not exist: ' .. arg[1])
os.exit(1)
end
dict = torch.load(arg[1])
dst = paths.basename(arg[1]):gsub('.th7', '.txt')
assert(dst:match('.txt$'))
f = io.open(dst, 'w')
for idx, symbol in ipairs(dict.index_to_symbol) do
if idx > dict.cutoff then
break
end
f:write(symbol)
f:write(' ')
f:write(dict.index_to_freq[idx])
f:write('\n')
end
f:close()
-- Copyright (c) Facebook, Inc. and its affiliates.
--
-- This source code is licensed under the MIT license found in the
-- LICENSE file in the root directory of this source tree.
--
-- Usage: convert_model.lua <model_epoch1.th7>
require 'torch'
local fairseq = require 'fairseq'
model = torch.load(arg[1])
function find_weight_norm(container, module)
for _, wn in ipairs(container:listModules()) do
if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then
return wn
end
end
end
function push_state(dict, key, module)
if torch.type(module) == 'nn.Linear' then
local wn = find_weight_norm(model.module, module)
assert(wn)
dict[key .. '.weight_v'] = wn.v:float()
dict[key .. '.weight_g'] = wn.g:float()
elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then
local wn = find_weight_norm(model.module, module)
assert(wn)
local v = wn.v:float():view(wn.viewOut):transpose(2, 3)
dict[key .. '.weight_v'] = v
dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1)
else
dict[key .. '.weight'] = module.weight:float()
end
if module.bias then
dict[key .. '.bias'] = module.bias:float()
end
end
encoder_dict = {}
decoder_dict = {}
combined_dict = {}
function encoder_state(encoder)
luts = encoder:findModules('nn.LookupTable')
push_state(encoder_dict, 'embed_tokens', luts[1])
push_state(encoder_dict, 'embed_positions', luts[2])
fcs = encoder:findModules('nn.Linear')
assert(#fcs >= 2)
local nInputPlane = fcs[1].weight:size(1)
push_state(encoder_dict, 'fc1', table.remove(fcs, 1))
push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs))
for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do
push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module)
if nInputPlane ~= module.weight:size(3) / 2 then
push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
end
nInputPlane = module.weight:size(3) / 2
end
assert(#fcs == 0)
end
function decoder_state(decoder)
luts = decoder:findModules('nn.LookupTable')
push_state(decoder_dict, 'embed_tokens', luts[1])
push_state(decoder_dict, 'embed_positions', luts[2])
fcs = decoder:findModules('nn.Linear')
local nInputPlane = fcs[1].weight:size(1)
push_state(decoder_dict, 'fc1', table.remove(fcs, 1))
push_state(decoder_dict, 'fc2', fcs[#fcs - 1])
push_state(decoder_dict, 'fc3', fcs[#fcs])
table.remove(fcs, #fcs)
table.remove(fcs, #fcs)
for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do
if nInputPlane ~= module.weight:size(3) / 2 then
push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
end
nInputPlane = module.weight:size(3) / 2
local prefix = 'attention.' .. tostring(i - 1)
push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1))
push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1))
push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module)
end
assert(#fcs == 0)
end
_encoder = model.module.modules[2]
_decoder = model.module.modules[3]
encoder_state(_encoder)
decoder_state(_decoder)
for k, v in pairs(encoder_dict) do
combined_dict['encoder.' .. k] = v
end
for k, v in pairs(decoder_dict) do
combined_dict['decoder.' .. k] = v
end
torch.save('state_dict.t7', combined_dict)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Count the number of documents and average number of lines and tokens per
document in a large file. Documents should be separated by a single empty line.
"""
import argparse
import gzip
import sys
import numpy as np
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("--gzip", action="store_true")
args = parser.parse_args()
def gopen():
if args.gzip:
return gzip.open(args.input, "r")
else:
return open(args.input, "r", encoding="utf-8")
num_lines = []
num_toks = []
with gopen() as h:
num_docs = 1
num_lines_in_doc = 0
num_toks_in_doc = 0
for i, line in enumerate(h):
if len(line.strip()) == 0: # empty line indicates new document
num_docs += 1
num_lines.append(num_lines_in_doc)
num_toks.append(num_toks_in_doc)
num_lines_in_doc = 0
num_toks_in_doc = 0
else:
num_lines_in_doc += 1
num_toks_in_doc += len(line.rstrip().split())
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
print("found {} docs".format(num_docs))
print("average num lines per doc: {}".format(np.mean(num_lines)))
print("average num toks per doc: {}".format(np.mean(num_toks)))
if __name__ == "__main__":
main()
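# Example usage (illustrative; the script path and corpus paths are placeholders):
#   python count_docs.py corpus.txt
#   python count_docs.py corpus.txt.gz --gzip
# Documents must be separated by a single empty line, as stated in the docstring above.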
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from fairseq.data import Dictionary, data_utils, indexed_dataset
def get_parser():
parser = argparse.ArgumentParser(
description="writes text from binarized file to stdout"
)
# fmt: off
parser.add_argument('--dataset-impl', help='dataset implementation',
choices=indexed_dataset.get_available_dataset_impl())
parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None)
parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read')
# fmt: on
return parser
def main():
parser = get_parser()
args = parser.parse_args()
dictionary = Dictionary.load(args.dict) if args.dict is not None else None
dataset = data_utils.load_indexed_dataset(
args.input,
dictionary,
dataset_impl=args.dataset_impl,
default="lazy",
)
for tensor_line in dataset:
if dictionary is None:
line = " ".join([str(int(x)) for x in tensor_line])
else:
line = dictionary.string(tensor_line)
print(line)
if __name__ == "__main__":
main()
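# Example usage (illustrative; the script path and data paths are placeholders):
#   python read_binarized.py --input data-bin/train.en-de.en --dict data-bin/dict.en.txt --dataset-impl mmap
# Without --dict, the raw integer token ids are printed instead of the dictionary symbols.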
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import re
import shutil
import sys
pt_regexp = re.compile(r"checkpoint(\d+|_\d+_\d+|_[a-z]+)\.pt")
pt_regexp_epoch_based = re.compile(r"checkpoint(\d+)\.pt")
pt_regexp_update_based = re.compile(r"checkpoint_\d+_(\d+)\.pt")
def parse_checkpoints(files):
entries = []
for f in files:
m = pt_regexp_epoch_based.fullmatch(f)
if m is not None:
entries.append((int(m.group(1)), m.group(0)))
else:
m = pt_regexp_update_based.fullmatch(f)
if m is not None:
entries.append((int(m.group(1)), m.group(0)))
return entries
def last_n_checkpoints(files, n):
entries = parse_checkpoints(files)
return [x[1] for x in sorted(entries, reverse=True)[:n]]
def every_n_checkpoints(files, n):
entries = parse_checkpoints(files)
return [x[1] for x in sorted(sorted(entries)[::-n])]
def main():
parser = argparse.ArgumentParser(
description=(
"Recursively delete checkpoint files from `root_dir`, "
"but preserve checkpoint_best.pt and checkpoint_last.pt"
)
)
parser.add_argument("root_dirs", nargs="*")
parser.add_argument(
"--save-last", type=int, default=0, help="number of last checkpoints to save"
)
parser.add_argument(
"--save-every", type=int, default=0, help="interval of checkpoints to save"
)
parser.add_argument(
"--preserve-test",
action="store_true",
help="preserve checkpoints in dirs that start with test_ prefix (default: delete them)",
)
parser.add_argument(
"--delete-best", action="store_true", help="delete checkpoint_best.pt"
)
parser.add_argument(
"--delete-last", action="store_true", help="delete checkpoint_last.pt"
)
parser.add_argument(
"--no-dereference", action="store_true", help="don't dereference symlinks"
)
args = parser.parse_args()
files_to_desymlink = []
files_to_preserve = []
files_to_delete = []
for root_dir in args.root_dirs:
for root, _subdirs, files in os.walk(root_dir):
if args.save_last > 0:
to_save = last_n_checkpoints(files, args.save_last)
else:
to_save = []
if args.save_every > 0:
to_save += every_n_checkpoints(files, args.save_every)
for file in files:
if not pt_regexp.fullmatch(file):
continue
full_path = os.path.join(root, file)
if (
not os.path.basename(root).startswith("test_") or args.preserve_test
) and (
(file == "checkpoint_last.pt" and not args.delete_last)
or (file == "checkpoint_best.pt" and not args.delete_best)
or file in to_save
):
if os.path.islink(full_path) and not args.no_dereference:
files_to_desymlink.append(full_path)
else:
files_to_preserve.append(full_path)
else:
files_to_delete.append(full_path)
if len(files_to_desymlink) == 0 and len(files_to_delete) == 0:
print("Nothing to do.")
sys.exit(0)
files_to_desymlink = sorted(files_to_desymlink)
files_to_preserve = sorted(files_to_preserve)
files_to_delete = sorted(files_to_delete)
print("Operations to perform (in order):")
if len(files_to_desymlink) > 0:
for file in files_to_desymlink:
print(" - preserve (and dereference symlink): " + file)
if len(files_to_preserve) > 0:
for file in files_to_preserve:
print(" - preserve: " + file)
if len(files_to_delete) > 0:
for file in files_to_delete:
print(" - delete: " + file)
while True:
resp = input("Continue? (Y/N): ")
if resp.strip().lower() == "y":
break
elif resp.strip().lower() == "n":
sys.exit(0)
print("Executing...")
if len(files_to_desymlink) > 0:
for file in files_to_desymlink:
realpath = os.path.realpath(file)
print("rm " + file)
os.remove(file)
print("cp {} {}".format(realpath, file))
shutil.copyfile(realpath, file)
if len(files_to_delete) > 0:
for file in files_to_delete:
print("rm " + file)
os.remove(file)
if __name__ == "__main__":
main()
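# Example usage (illustrative; the script path and directory are placeholders): keep the 2 most recent
# checkpoints plus checkpoint_best.pt / checkpoint_last.pt, deleting the rest after interactive confirmation:
#   python rm_pt.py /path/to/exp_dir --save-last 2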