"vscode:/vscode.git/clone" did not exist on "f23f8a0688557e3ca3cf8bbf8e7669eab9912434"
Commit 12c90639 authored by “change”'s avatar “change”
Browse files

init

parent 417b607b
# ####################################
# SpeechUT Large model: ASR fine-tuning (960h)  #
# ####################################
[ $# -lt 3 ] && echo "Usage: $0 <model_path> <data_dir> <cpt_tag> [mount=${PWD}] [world_size=8] [update_freq=3]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
w2v_path=$1
DATA_DIR=$2
cpt=$3
mount=$4
world_size=$5
update_freq=$6
[ -z $mount ] && mount=${PWD}
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=3
CODE_ROOT=${PWD}
exp_name=${w2v_path%/*}
exp_name=${exp_name##*/}
MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
--config-dir $CODE_ROOT/speechut/config/finetune_asr \
--config-name speechut_large_960h \
common.user_dir=$CODE_ROOT/speechut \
\
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
model.w2v_path=${w2v_path} \
\
optimization.lr=[0.00001] \
optimization.max_update=80000 \
dataset.max_tokens=1100000 \
optimization.update_freq=[${update_freq}] \
distributed_training.distributed_world_size=${world_size} \
\
dataset.train_subset="train_960" \
dataset.valid_subset="dev_other" \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=960h_edctc80k_from_${cpt}_bz3.3m_lr1e-5
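# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/speechut_large_pretrained.pt /path/to/asr/manifest_dir <cpt_tag>
# Optional trailing args override mount (default: current directory), world_size (default: 8) and update_freq (default: 3).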
# ####################################
# SpeechUT Base model: ASR fine-tuning (100h)  #
# ####################################
[ $# -lt 3 ] && echo "Usage: $0 <model_path> <data_dir> <cpt_tag> [mount=${PWD}] [world_size=8] [update_freq=2]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
w2v_path=$1
DATA_DIR=$2
cpt=$3
mount=$4
world_size=$5
update_freq=$6
[ -z $mount ] && mount=${PWD}
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=2
CODE_ROOT=${PWD}
exp_name=${w2v_path%/*}
exp_name=${exp_name##*/}
MODEL_DIR="${mount}/exp/finetune_asr/$exp_name/edctc40k_from_${cpt}_bz2.6m_lr1e-5"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $CODE_ROOT/fairseq/fairseq_cli/hydra_train.py \
--config-dir $CODE_ROOT/speechut/config/finetune_asr \
--config-name speechut_base_100h \
common.user_dir=$CODE_ROOT/speechut \
\
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
model.w2v_path=${w2v_path} \
\
optimization.lr=[0.00001] \
optimization.max_update=40000 \
dataset.max_tokens=1300000 \
optimization.update_freq=[${update_freq}] \
distributed_training.distributed_world_size=${world_size} \
\
dataset.train_subset="train_clean_100" \
dataset.valid_subset="dev_other" \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=edctc40k_from_${cpt}_bz2.6m_lr1e-5
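# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/speechut_base_pretrained.pt /path/to/asr/manifest_dir <cpt_tag>
# Optional trailing args override mount (default: current directory), world_size (default: 8) and update_freq (default: 2).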
#####################################
# SpeechUT ASR model: joint CTC/attention decoding  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
extra=$6
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=10
[ -z $ctc_weight ] && ctc_weight=0.2
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 since CTC decoding is not used..." && beam_size=1
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 2000000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
&
done
wait
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}
echo $results_path
tail -n 1 $results_path/generate-*.txt
done
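# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_asr_model.pt /path/to/asr/manifest_dir dev_clean,dev_other
# Defaults: gen_set=dev_other, beam_size=10, ctc_weight=0.2; append --normalize when decoding a Large model.
# Each subset is decoded in the background, and the WER is read from the last line of its generate-*.txt.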
#####################################
# SpeechUT ASR model: decoding with external LM  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=30] [ctc_weight=0.3] [lm_weight=0.7] [lm_path] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
lm_weight=$6
lm_path=$7
extra=$8
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=30
[ -z $ctc_weight ] && ctc_weight=0.3
[ -z $lm_weight ] && lm_weight=0.7
[ -z $lm_path ] && lm_path="/mnt/default/v-junyiao/librispeech/lm/lm_ctc_form/checkpoint_best.pt"
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 and lm_weight to 0 since CTC decoding is not used..." && beam_size=1 && lm_weight=0
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 800000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--lm-weight ${lm_weight} --lm-path ${lm_path} \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path ${results_path} \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
&
done
wait
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}
echo $results_path
tail -n 1 $results_path/generate-*.txt
done
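# Example invocation (illustrative only; the script filename and all paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_asr_model.pt /path/to/asr/manifest_dir dev_other 30 0.3 0.7 /path/to/lm/checkpoint_best.pt
# Defaults: beam_size=30, ctc_weight=0.3, lm_weight=0.7; append --normalize when decoding a Large model.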
#####################################
# SpeechUT ASR model: sharded decoding with external LM  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=30] [ctc_weight=0.3] [lm_weight=0.7] [lm_path] [nj=8] [ngpu=8] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
lm_weight=$6
lm_path=$7
nj=$8
ngpu=$9
extra=${10}
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=30
[ -z $ctc_weight ] && ctc_weight=0.3
[ -z $lm_weight ] && lm_weight=0.7
[ -z $lm_path ] && lm_path="/mnt/default/v-junyiao/librispeech/lm/lm_ctc_form/checkpoint_best.pt"
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 and lm_weight to 0 since CTC decoding is not used..." && beam_size=1 && lm_weight=0
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
[ -z $nj ] && nj=8
[ -z $ngpu ] && ngpu=8
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
world_size=$nj
for rank in $(seq 0 $((nj - 1))); do
export CUDA_VISIBLE_DEVICES=$((rank % $ngpu))
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}/${subset}_${world_size}_${rank}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 800000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--lm-weight ${lm_weight} --lm-path ${lm_path} \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
--distributed-world-size ${world_size} --distributed-rank ${rank} \
&
done
done
wait
for subset in ${gen_set//,/ }; do
results_dir=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}_lm${lm_weight}
cat $results_dir/${subset}_${world_size}_*/generate-${subset}.txt | grep -v "^Generate" > $results_dir/generate-${subset}.all.txt
done
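# Example invocation (illustrative only; the script filename and all paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_asr_model.pt /path/to/asr/manifest_dir test_other 30 0.3 0.7 /path/to/lm.pt 8 8
# nj shards run in parallel (one rank per job, assigned round-robin over ngpu GPUs); the per-shard
# generate-<subset>.txt files are then concatenated into generate-<subset>.all.txt for scoring.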
#####################################
# SpeechUT ASR model: sharded decoding without LM  #
#####################################
[ $# -lt 2 ] && echo "Usage: $0 <model_path> <data_dir> [gen-set=dev_other] [beam_size=10] [ctc_weight=0.2] [nj=32] [ngpu=8] [--normalize]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
gen_set=$3
beam_size=$4
ctc_weight=$5
nj=$6
ngpu=$7
extra=$8
[ -z $extra ] && echo "Assuming a base model! If you are decoding a large model, please add '--normalize' at the end..."
[ -z $gen_set ] && gen_set="dev_other"
[ -z $beam_size ] && beam_size=10
[ -z $ctc_weight ] && ctc_weight=0.2
[ $ctc_weight == 0 ] && [ $beam_size != 1 ] && echo "Setting beam size to 1 since CTC decoding is not used..." && beam_size=1
[ $ctc_weight != 0 ] && extra="$extra --batch-size 1"
[ -z $nj ] && nj=32
[ -z $ngpu ] && ngpu=8
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
world_size=$nj
for rank in $(seq 0 $((nj - 1))); do
export CUDA_VISIBLE_DEVICES=$((rank % $ngpu))
for subset in ${gen_set//,/ }; do
results_path=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}/${subset}_${world_size}_${rank}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--user-dir $CODE_ROOT/speechut \
--label-dir ${DATA_DIR} \
--labels '["ltr"]' \
--single-target \
--post-process letter \
--gen-subset ${subset} \
--max-tokens 2000000 \
\
--task joint_sc2t_pretraining \
--add-decoder-target \
--fine-tuning \
--pad-audio \
--random-crop \
\
--ctc-weight ${ctc_weight} $extra \
--beam ${beam_size} \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring wer --max-len-a 0.00078125 --max-len-b 200 \
--distributed-world-size ${world_size} --distributed-rank ${rank} \
&
done
done
wait
for subset in ${gen_set//,/ }; do
results_dir=$src_dir/decode_${cpt}/beam${beam_size}_ctc${ctc_weight}
cat $results_dir/${subset}_${world_size}_*/generate-${subset}.txt | grep -v "^Generate" > $results_dir/generate-${subset}.all.txt
done
# ####################################
# SpeechUT Base model: ST fine-tuning (MuST-C)  #
# ####################################
[ $# -lt 4 ] && echo "Usage: $0 <model_path> <data_dir> <lang> <cpt-tag> [mount=${PWD}] [world_size=8] [update_freq=4/6]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
w2v_path=$1
DATA_DIR=$2
lang=$3
cpt=$4
mount=$5
world_size=$6
update_freq=$7
[ -z $mount ] && mount=${PWD}
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=4
CODE_ROOT=${PWD}
exp_name=${w2v_path%/*}
exp_name=${exp_name##*/}
MODEL_DIR="$mount/exp/finetune_mustc/$exp_name/legacy_en${lang}_from_${cpt}_bz3.2m_lr3e-5"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
max_tokens=800000
python $CODE_ROOT/fairseq/fairseq_cli/train.py ${DATA_DIR} \
--save-dir ${MODEL_DIR} \
--user-dir $CODE_ROOT/speechut \
--task speech_to_text \
--config-yaml config_en${lang}.yaml \
--train-subset "train_st" \
--valid-subset "dev_st" \
--fp16 \
--seed 1 \
\
--ddp-backend no_c10d \
--distributed-world-size ${world_size} \
--tensorboard-logdir ${MODEL_DIR} \
\
--criterion label_smoothed_cross_entropy --report-accuracy \
--label-smoothing 0.3 \
\
--optimizer adam \
--clip-norm 1.0 \
--lr 3e-05 \
--lr-scheduler polynomial_decay --warmup-updates 5000 \
--max-update 50000 \
--total-num-update 50000 \
--update-freq ${update_freq} \
\
--max-tokens ${max_tokens} \
--max-sentences 16 \
--max-tokens-valid ${max_tokens} \
--grouped-shuffling \
--max-source-positions ${max_tokens} \
--skip-invalid-size-inputs-valid-test \
--num-workers 0 \
--best-checkpoint-metric "accuracy" \
--maximize-best-checkpoint-metric \
\
--arch "speechut_st_legacy" \
--w2v-path ${w2v_path} \
--layerdrop 0.1 \
--activation-dropout 0.1 \
--attention-dropout 0.1 \
--feature-grad-mult 1.0 \
\
--apply-mask --mask-prob 0.5 \
\
--log-format json \
--log-interval 100 \
--save-interval 1 \
--keep-last-epochs 5 \
--keep-best-checkpoints 5 \
\
2>&1 | tee ${MODEL_DIR}/train_en${lang}.log
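# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/speechut_base_st_pretrained.pt /path/to/mustc/en-de de <cpt_tag>
# Expects config_en<lang>.yaml plus train_st/dev_st manifests under <data_dir> (standard fairseq
# speech_to_text layout); training output is also teed to ${MODEL_DIR}/train_en<lang>.log.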
# ####################################
# SpeechUT Base model: ST decoding  #
# ####################################
[ $# -lt 3 ] && echo "Usage: $0 <model_path> <data_dir> <lang> [gen-set=dev] [beam_size=10] [lenpen=1.0]" && exit 1
[ ${PWD##*/} != SpeechUT ] && echo "Error: directory mismatch! Switch to SpeechUT/ and run it again!" && exit 1
model_path=$1
DATA_DIR=$2
lang=$3
gen_set=$4
beam_size=$5
lenpen=$6
[ -z $gen_set ] && gen_set="dev"
[ -z $beam_size ] && beam_size=10
[ -z $lenpen ] && lenpen=1
src_dir=${model_path%/*}
cpt=${model_path##*/}
cpt=${cpt%.*}
CODE_ROOT=${PWD}
results_path=$src_dir/decode_${cpt}_beam${beam_size}/${gen_set}
[ ! -d $results_path ] && mkdir -p $results_path
python $CODE_ROOT/fairseq/fairseq_cli/generate.py $DATA_DIR \
--gen-subset ${gen_set}_st \
--max-tokens 2000000 \
--max-source-positions 2000000 \
--num-workers 0 \
\
--user-dir $CODE_ROOT/speechut \
--task speech_to_text \
--config-yaml config_en${lang}.yaml \
\
--path ${model_path} \
--results-path $results_path \
\
--scoring sacrebleu --max-len-a 0 --max-len-b 512 \
--beam ${beam_size} \
--lenpen $lenpen \
# --model-overrides "{'model':{'w2v_path':'/path/to/your/pretrained/model.pt'}}" \
echo $results_path
tail -n 1 $results_path/generate-*.txt
sleep 1s
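# Example invocation (illustrative only; the script filename and paths are placeholders):
#   bash <this_script>.sh /path/to/finetuned_st_model.pt /path/to/mustc/en-de de dev 10 1.0
# BLEU is reported via --scoring sacrebleu; uncomment --model-overrides above if the w2v_path stored
# in the checkpoint no longer exists on this machine.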
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import collections
import os
import re
import torch
from fairseq.file_io import PathManager
def average_checkpoints(inputs):
"""Loads checkpoints from inputs and returns a model with averaged weights.
Args:
inputs: An iterable of string paths of checkpoints to load from.
Returns:
A dict of string keys mapping to various values. The 'model' key
from the returned dict should correspond to an OrderedDict mapping
string parameter names to torch Tensors.
"""
params_dict = collections.OrderedDict()
params_keys = None
new_state = None
num_models = len(inputs)
for fpath in inputs:
with PathManager.open(fpath, "rb") as f:
state = torch.load(
f,
map_location=(
lambda s, _: torch.serialization.default_restore_location(s, "cpu")
),
)
# Copies over the settings from the first checkpoint
if new_state is None:
new_state = state
model_params = state["model"]
model_params_keys = list(model_params.keys())
if params_keys is None:
params_keys = model_params_keys
elif params_keys != model_params_keys:
raise KeyError(
"For checkpoint {}, expected list of params: {}, "
"but found: {}".format(f, params_keys, model_params_keys)
)
for k in params_keys:
p = model_params[k]
if isinstance(p, torch.HalfTensor):
p = p.float()
if k not in params_dict:
params_dict[k] = p.clone()
# NOTE: clone() is needed in case of p is a shared parameter
else:
params_dict[k] += p
averaged_params = collections.OrderedDict()
for k, v in params_dict.items():
averaged_params[k] = v
if averaged_params[k].is_floating_point():
averaged_params[k].div_(num_models)
else:
averaged_params[k] //= num_models
new_state["model"] = averaged_params
return new_state
def last_n_checkpoints(paths, n, update_based, upper_bound=None):
assert len(paths) == 1
path = paths[0]
if update_based:
pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
else:
pt_regexp = re.compile(r"checkpoint(\d+)\.pt")
files = PathManager.ls(path)
entries = []
for f in files:
m = pt_regexp.fullmatch(f)
if m is not None:
sort_key = int(m.group(1))
if upper_bound is None or sort_key <= upper_bound:
entries.append((sort_key, m.group(0)))
if len(entries) < n:
raise Exception(
"Found {} checkpoint files but need at least {}", len(entries), n
)
return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
def main():
parser = argparse.ArgumentParser(
description="Tool to average the params of input checkpoints to "
"produce a new checkpoint",
)
# fmt: off
parser.add_argument('--inputs', required=True, nargs='+',
help='Input checkpoint file paths.')
parser.add_argument('--output', required=True, metavar='FILE',
help='Write the new checkpoint containing the averaged weights to this path.')
num_group = parser.add_mutually_exclusive_group()
num_group.add_argument('--num-epoch-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the '
'path specified by input, and average last this many of them.')
num_group.add_argument('--num-update-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by'
' input, and average last this many of them.')
parser.add_argument('--checkpoint-upper-bound', type=int,
help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be'
' averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would'
' be averaged assuming --save-interval-updates 500'
)
# fmt: on
args = parser.parse_args()
print(args)
num = None
is_update_based = False
if args.num_update_checkpoints is not None:
num = args.num_update_checkpoints
is_update_based = True
elif args.num_epoch_checkpoints is not None:
num = args.num_epoch_checkpoints
assert args.checkpoint_upper_bound is None or (
args.num_epoch_checkpoints is not None
or args.num_update_checkpoints is not None
), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints"
assert (
args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"
if num is not None:
args.inputs = last_n_checkpoints(
args.inputs,
num,
is_update_based,
upper_bound=args.checkpoint_upper_bound,
)
print("averaging checkpoints: ", args.inputs)
new_state = average_checkpoints(args.inputs)
with PathManager.open(args.output, "wb") as f:
torch.save(new_state, f)
print("Finished writing averaged checkpoint to {}".format(args.output))
if __name__ == "__main__":
main()
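# Example usage (illustrative; the script path and checkpoint paths are placeholders):
#   python average_checkpoints.py --inputs /path/to/ckpt_dir --num-epoch-checkpoints 5 --output /path/to/avg_5.pt
#   python average_checkpoints.py --inputs ckpt_a.pt ckpt_b.pt ckpt_c.pt --output avg.pt
# With --num-epoch-checkpoints/--num-update-checkpoints, --inputs must be a single directory that is
# scanned for checkpoint files; otherwise every path listed in --inputs is averaged directly.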
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Use this script in order to build symmetric alignments for your translation
dataset.
This script depends on fast_align and mosesdecoder tools. You will need to
build those before running the script.
fast_align:
github: http://github.com/clab/fast_align
instructions: follow the instructions in README.md
mosesdecoder:
github: http://github.com/moses-smt/mosesdecoder
instructions: http://www.statmt.org/moses/?n=Development.GetStarted
The script produces the following files under --output_dir:
text.joined - concatenation of lines from the source_file and the
target_file.
align.forward - forward pass of fast_align.
align.backward - backward pass of fast_align.
aligned.sym_heuristic - symmetrized alignment.
"""
import argparse
import os
from itertools import zip_longest
def main():
parser = argparse.ArgumentParser(description="symmetric alignment builer")
# fmt: off
parser.add_argument('--fast_align_dir',
help='path to fast_align build directory')
parser.add_argument('--mosesdecoder_dir',
help='path to mosesdecoder root directory')
parser.add_argument('--sym_heuristic',
help='heuristic to use for symmetrization',
default='grow-diag-final-and')
parser.add_argument('--source_file',
help='path to a file with sentences '
'in the source language')
parser.add_argument('--target_file',
help='path to a file with sentences '
'in the target language')
parser.add_argument('--output_dir',
help='output directory')
# fmt: on
args = parser.parse_args()
fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
sym_fast_align_bin = os.path.join(
args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
)
# create joined file
joined_file = os.path.join(args.output_dir, "text.joined")
with open(args.source_file, "r", encoding="utf-8") as src, open(
args.target_file, "r", encoding="utf-8"
) as tgt:
with open(joined_file, "w", encoding="utf-8") as joined:
for s, t in zip_longest(src, tgt):
print("{} ||| {}".format(s.strip(), t.strip()), file=joined)
bwd_align_file = os.path.join(args.output_dir, "align.backward")
# run forward alignment
fwd_align_file = os.path.join(args.output_dir, "align.forward")
fwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v > {FWD}".format(
FASTALIGN=fast_align_bin, JOINED=joined_file, FWD=fwd_align_file
)
assert os.system(fwd_fast_align_cmd) == 0
# run backward alignment
bwd_align_file = os.path.join(args.output_dir, "align.backward")
bwd_fast_align_cmd = "{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}".format(
FASTALIGN=fast_align_bin, JOINED=joined_file, BWD=bwd_align_file
)
assert os.system(bwd_fast_align_cmd) == 0
# run symmetrization
sym_out_file = os.path.join(args.output_dir, "aligned")
sym_cmd = "{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}".format(
SYMFASTALIGN=sym_fast_align_bin,
FWD=fwd_align_file,
BWD=bwd_align_file,
SRC=args.source_file,
TGT=args.target_file,
OUT=sym_out_file,
HEURISTIC=args.sym_heuristic,
SYMAL=symal_bin,
)
assert os.system(sym_cmd) == 0
if __name__ == "__main__":
main()
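# Example usage (illustrative; the script path and data paths are placeholders):
#   python build_sym_alignment.py \
#       --fast_align_dir ~/fast_align/build --mosesdecoder_dir ~/mosesdecoder \
#       --source_file train.de --target_file train.en --output_dir alignments/
# Produces text.joined, align.forward, align.backward and the symmetrized alignment under --output_dir.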
#!/usr/bin/env python
"""Helper script to compare two argparse.Namespace objects."""
from argparse import Namespace # noqa
def main():
ns1 = eval(input("Namespace 1: "))
ns2 = eval(input("Namespace 2: "))
def keys(ns):
ks = set()
for k in dir(ns):
if not k.startswith("_"):
ks.add(k)
return ks
k1 = keys(ns1)
k2 = keys(ns2)
def print_keys(ks, ns1, ns2=None):
for k in ks:
if ns2 is None:
print("{}\t{}".format(k, getattr(ns1, k, None)))
else:
print(
"{}\t{}\t{}".format(k, getattr(ns1, k, None), getattr(ns2, k, None))
)
print("Keys unique to namespace 1:")
print_keys(k1 - k2, ns1)
print()
print("Keys unique to namespace 2:")
print_keys(k2 - k1, ns2)
print()
print("Overlapping keys with different values:")
ks = [k for k in k1 & k2 if getattr(ns1, k, "None") != getattr(ns2, k, "None")]
print_keys(ks, ns1, ns2)
print()
if __name__ == "__main__":
main()
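# Example usage (illustrative): run the script and paste the repr() of each Namespace at the prompts, e.g.
#   Namespace 1: Namespace(lr=0.001, max_update=80000)
#   Namespace 2: Namespace(lr=0.0005, max_update=80000)
# The inputs are eval()'d, so only paste namespaces from a trusted source.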
#!/bin/bash
if [ $# -ne 1 ]; then
echo "usage: $0 GENERATE_PY_OUTPUT"
exit 1
fi
GEN=$1
SYS=$GEN.sys
REF=$GEN.ref
if [ $(tail -n 1 $GEN | grep BLEU | wc -l) -ne 1 ]; then
echo "not done generating"
exit
fi
grep ^H $GEN | awk -F '\t' '{print $NF}' | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $SYS
grep ^T $GEN | cut -f2- | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $REF
fairseq-score --sys $SYS --ref $REF
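# Example usage (illustrative; the script filename is a placeholder):
#   bash compound_split_bleu.sh /path/to/generate-test.txt
# Rewrites hyphenated compounds as "##AT##-##AT##" in both hypothesis (H-) and reference (T-) lines
# before scoring with fairseq-score.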
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Extracts random constraints from reference files."""
import argparse
import random
import sys
def get_phrase(words, index, length):
assert index < len(words) - length + 1
phr = " ".join(words[index : index + length])
for i in range(index, index + length):
words.pop(index)
return phr
def main(args):
if args.seed:
random.seed(args.seed)
for line in sys.stdin:
constraints = []
def add_constraint(constraint):
constraints.append(constraint)
source = line.rstrip()
if "\t" in line:
source, target = line.split("\t")
if args.add_sos:
target = f"<s> {target}"
if args.add_eos:
target = f"{target} </s>"
if len(target.split()) >= args.len:
words = [target]
num = args.number
choices = {}
for i in range(num):
if len(words) == 0:
break
segmentno = random.choice(range(len(words)))
segment = words.pop(segmentno)
tokens = segment.split()
phrase_index = random.choice(range(len(tokens)))
choice = " ".join(
tokens[phrase_index : min(len(tokens), phrase_index + args.len)]
)
for j in range(
phrase_index, min(len(tokens), phrase_index + args.len)
):
tokens.pop(phrase_index)
if phrase_index > 0:
words.append(" ".join(tokens[0:phrase_index]))
if phrase_index + 1 < len(tokens):
words.append(" ".join(tokens[phrase_index:]))
choices[target.find(choice)] = choice
# mask out with spaces
target = target.replace(choice, " " * len(choice), 1)
for key in sorted(choices.keys()):
add_constraint(choices[key])
print(source, *constraints, sep="\t")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--number", "-n", type=int, default=1, help="number of phrases")
parser.add_argument("--len", "-l", type=int, default=1, help="phrase length")
parser.add_argument(
"--add-sos", default=False, action="store_true", help="add <s> token"
)
parser.add_argument(
"--add-eos", default=False, action="store_true", help="add </s> token"
)
parser.add_argument("--seed", "-s", default=0, type=int)
args = parser.parse_args()
main(args)
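# Example usage (illustrative; file names are placeholders): feed tab-separated "source<TAB>target"
# pairs on stdin; sampled target phrases are appended as extra tab-separated constraint columns:
#   paste test.src test.tgt | python extract.py --number 2 --len 3 > test.constrained.tsv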
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
"""Reads in a fairseq output file, and verifies that the constraints
(C- lines) are present in the output (the first H- line). Assumes that
constraints are listed prior to the first hypothesis.
"""
constraints = []
found = 0
total = 0
for line in sys.stdin:
if line.startswith("C-"):
constraints.append(line.rstrip().split("\t")[1])
elif line.startswith("H-"):
text = line.split("\t")[2]
for constraint in constraints:
total += 1
if constraint in text:
found += 1
else:
print(f"No {constraint} in {text}", file=sys.stderr)
constraints = []
print(f"Found {found} / {total} = {100 * found / total:.1f}%")
-- Copyright (c) Facebook, Inc. and its affiliates.
--
-- This source code is licensed under the MIT license found in the
-- LICENSE file in the root directory of this source tree.
--
-- Usage: convert_dictionary.lua <dict.th7>
require 'fairseq'
require 'torch'
require 'paths'
if #arg < 1 then
print('usage: convert_dictionary.lua <dict.th7>')
os.exit(1)
end
if not paths.filep(arg[1]) then
print('error: file does not exist: ' .. arg[1])
os.exit(1)
end
dict = torch.load(arg[1])
dst = paths.basename(arg[1]):gsub('.th7', '.txt')
assert(dst:match('.txt$'))
f = io.open(dst, 'w')
for idx, symbol in ipairs(dict.index_to_symbol) do
if idx > dict.cutoff then
break
end
f:write(symbol)
f:write(' ')
f:write(dict.index_to_freq[idx])
f:write('\n')
end
f:close()
-- Copyright (c) Facebook, Inc. and its affiliates.
--
-- This source code is licensed under the MIT license found in the
-- LICENSE file in the root directory of this source tree.
--
-- Usage: convert_model.lua <model_epoch1.th7>
require 'torch'
local fairseq = require 'fairseq'
model = torch.load(arg[1])
function find_weight_norm(container, module)
for _, wn in ipairs(container:listModules()) do
if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then
return wn
end
end
end
function push_state(dict, key, module)
if torch.type(module) == 'nn.Linear' then
local wn = find_weight_norm(model.module, module)
assert(wn)
dict[key .. '.weight_v'] = wn.v:float()
dict[key .. '.weight_g'] = wn.g:float()
elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then
local wn = find_weight_norm(model.module, module)
assert(wn)
local v = wn.v:float():view(wn.viewOut):transpose(2, 3)
dict[key .. '.weight_v'] = v
dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1)
else
dict[key .. '.weight'] = module.weight:float()
end
if module.bias then
dict[key .. '.bias'] = module.bias:float()
end
end
encoder_dict = {}
decoder_dict = {}
combined_dict = {}
function encoder_state(encoder)
luts = encoder:findModules('nn.LookupTable')
push_state(encoder_dict, 'embed_tokens', luts[1])
push_state(encoder_dict, 'embed_positions', luts[2])
fcs = encoder:findModules('nn.Linear')
assert(#fcs >= 2)
local nInputPlane = fcs[1].weight:size(1)
push_state(encoder_dict, 'fc1', table.remove(fcs, 1))
push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs))
for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do
push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module)
if nInputPlane ~= module.weight:size(3) / 2 then
push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
end
nInputPlane = module.weight:size(3) / 2
end
assert(#fcs == 0)
end
function decoder_state(decoder)
luts = decoder:findModules('nn.LookupTable')
push_state(decoder_dict, 'embed_tokens', luts[1])
push_state(decoder_dict, 'embed_positions', luts[2])
fcs = decoder:findModules('nn.Linear')
local nInputPlane = fcs[1].weight:size(1)
push_state(decoder_dict, 'fc1', table.remove(fcs, 1))
push_state(decoder_dict, 'fc2', fcs[#fcs - 1])
push_state(decoder_dict, 'fc3', fcs[#fcs])
table.remove(fcs, #fcs)
table.remove(fcs, #fcs)
for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do
if nInputPlane ~= module.weight:size(3) / 2 then
push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
end
nInputPlane = module.weight:size(3) / 2
local prefix = 'attention.' .. tostring(i - 1)
push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1))
push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1))
push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module)
end
assert(#fcs == 0)
end
_encoder = model.module.modules[2]
_decoder = model.module.modules[3]
encoder_state(_encoder)
decoder_state(_decoder)
for k, v in pairs(encoder_dict) do
combined_dict['encoder.' .. k] = v
end
for k, v in pairs(decoder_dict) do
combined_dict['decoder.' .. k] = v
end
torch.save('state_dict.t7', combined_dict)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Count the number of documents and average number of lines and tokens per
document in a large file. Documents should be separated by a single empty line.
"""
import argparse
import gzip
import sys
import numpy as np
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("--gzip", action="store_true")
args = parser.parse_args()
def gopen():
if args.gzip:
return gzip.open(args.input, "r")
else:
return open(args.input, "r", encoding="utf-8")
num_lines = []
num_toks = []
with gopen() as h:
num_docs = 1
num_lines_in_doc = 0
num_toks_in_doc = 0
for i, line in enumerate(h):
if len(line.strip()) == 0: # empty line indicates new document
num_docs += 1
num_lines.append(num_lines_in_doc)
num_toks.append(num_toks_in_doc)
num_lines_in_doc = 0
num_toks_in_doc = 0
else:
num_lines_in_doc += 1
num_toks_in_doc += len(line.rstrip().split())
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
print("found {} docs".format(num_docs))
print("average num lines per doc: {}".format(np.mean(num_lines)))
print("average num toks per doc: {}".format(np.mean(num_toks)))
if __name__ == "__main__":
main()
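# Example usage (illustrative; the script path and corpus paths are placeholders):
#   python count_docs.py corpus.txt
#   python count_docs.py corpus.txt.gz --gzip
# Documents must be separated by a single empty line, as stated in the docstring above.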
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from fairseq.data import Dictionary, data_utils, indexed_dataset
def get_parser():
parser = argparse.ArgumentParser(
description="writes text from binarized file to stdout"
)
# fmt: off
parser.add_argument('--dataset-impl', help='dataset implementation',
choices=indexed_dataset.get_available_dataset_impl())
parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None)
parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read')
# fmt: on
return parser
def main():
parser = get_parser()
args = parser.parse_args()
dictionary = Dictionary.load(args.dict) if args.dict is not None else None
dataset = data_utils.load_indexed_dataset(
args.input,
dictionary,
dataset_impl=args.dataset_impl,
default="lazy",
)
for tensor_line in dataset:
if dictionary is None:
line = " ".join([str(int(x)) for x in tensor_line])
else:
line = dictionary.string(tensor_line)
print(line)
if __name__ == "__main__":
main()
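# Example usage (illustrative; the script path and data paths are placeholders):
#   python read_binarized.py --input data-bin/train.en-de.en --dict data-bin/dict.en.txt --dataset-impl mmap
# Without --dict, the raw integer token ids are printed instead of the dictionary symbols.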
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import os
import re
import shutil
import sys
pt_regexp = re.compile(r"checkpoint(\d+|_\d+_\d+|_[a-z]+)\.pt")
pt_regexp_epoch_based = re.compile(r"checkpoint(\d+)\.pt")
pt_regexp_update_based = re.compile(r"checkpoint_\d+_(\d+)\.pt")
def parse_checkpoints(files):
entries = []
for f in files:
m = pt_regexp_epoch_based.fullmatch(f)
if m is not None:
entries.append((int(m.group(1)), m.group(0)))
else:
m = pt_regexp_update_based.fullmatch(f)
if m is not None:
entries.append((int(m.group(1)), m.group(0)))
return entries
def last_n_checkpoints(files, n):
entries = parse_checkpoints(files)
return [x[1] for x in sorted(entries, reverse=True)[:n]]
def every_n_checkpoints(files, n):
entries = parse_checkpoints(files)
return [x[1] for x in sorted(sorted(entries)[::-n])]
def main():
parser = argparse.ArgumentParser(
description=(
"Recursively delete checkpoint files from `root_dir`, "
"but preserve checkpoint_best.pt and checkpoint_last.pt"
)
)
parser.add_argument("root_dirs", nargs="*")
parser.add_argument(
"--save-last", type=int, default=0, help="number of last checkpoints to save"
)
parser.add_argument(
"--save-every", type=int, default=0, help="interval of checkpoints to save"
)
parser.add_argument(
"--preserve-test",
action="store_true",
help="preserve checkpoints in dirs that start with test_ prefix (default: delete them)",
)
parser.add_argument(
"--delete-best", action="store_true", help="delete checkpoint_best.pt"
)
parser.add_argument(
"--delete-last", action="store_true", help="delete checkpoint_last.pt"
)
parser.add_argument(
"--no-dereference", action="store_true", help="don't dereference symlinks"
)
args = parser.parse_args()
files_to_desymlink = []
files_to_preserve = []
files_to_delete = []
for root_dir in args.root_dirs:
for root, _subdirs, files in os.walk(root_dir):
if args.save_last > 0:
to_save = last_n_checkpoints(files, args.save_last)
else:
to_save = []
if args.save_every > 0:
to_save += every_n_checkpoints(files, args.save_every)
for file in files:
if not pt_regexp.fullmatch(file):
continue
full_path = os.path.join(root, file)
if (
not os.path.basename(root).startswith("test_") or args.preserve_test
) and (
(file == "checkpoint_last.pt" and not args.delete_last)
or (file == "checkpoint_best.pt" and not args.delete_best)
or file in to_save
):
if os.path.islink(full_path) and not args.no_dereference:
files_to_desymlink.append(full_path)
else:
files_to_preserve.append(full_path)
else:
files_to_delete.append(full_path)
if len(files_to_desymlink) == 0 and len(files_to_delete) == 0:
print("Nothing to do.")
sys.exit(0)
files_to_desymlink = sorted(files_to_desymlink)
files_to_preserve = sorted(files_to_preserve)
files_to_delete = sorted(files_to_delete)
print("Operations to perform (in order):")
if len(files_to_desymlink) > 0:
for file in files_to_desymlink:
print(" - preserve (and dereference symlink): " + file)
if len(files_to_preserve) > 0:
for file in files_to_preserve:
print(" - preserve: " + file)
if len(files_to_delete) > 0:
for file in files_to_delete:
print(" - delete: " + file)
while True:
resp = input("Continue? (Y/N): ")
if resp.strip().lower() == "y":
break
elif resp.strip().lower() == "n":
sys.exit(0)
print("Executing...")
if len(files_to_desymlink) > 0:
for file in files_to_desymlink:
realpath = os.path.realpath(file)
print("rm " + file)
os.remove(file)
print("cp {} {}".format(realpath, file))
shutil.copyfile(realpath, file)
if len(files_to_delete) > 0:
for file in files_to_delete:
print("rm " + file)
os.remove(file)
if __name__ == "__main__":
main()
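# Example usage (illustrative; the script path and directory are placeholders): keep the 2 most recent
# checkpoints plus checkpoint_best.pt / checkpoint_last.pt, deleting the rest after interactive confirmation:
#   python rm_pt.py /path/to/exp_dir --save-last 2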