Commit 12c90639 authored by “change”

init

parent 417b607b
#!/bin/bash
if [ $# -ne 4 ]; then
    echo "usage: $0 TESTSET SRCLANG TGTLANG GEN"
    exit 1
fi

TESTSET=$1
SRCLANG=$2
TGTLANG=$3
GEN=$4

if ! command -v sacremoses &> /dev/null; then
    echo "sacremoses could not be found, please install with: pip install sacremoses"
    exit 1
fi

# sort hypotheses by sample id, detokenize, then score with sacrebleu
grep ^H "$GEN" \
    | sed 's/^H\-//' \
    | sort -n -k 1 \
    | cut -f 3 \
    | sacremoses detokenize \
    > "$GEN.sorted.detok"

sacrebleu --test-set "$TESTSET" --language-pair "${SRCLANG}-${TGTLANG}" < "$GEN.sorted.detok"
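For reference, a possible invocation of this scoring script (saved here under the assumed name score_sacrebleu.sh) on a fairseq-generate output file would be:

    bash score_sacrebleu.sh wmt14 en de gen.out

where gen.out is the file produced by fairseq-generate and the test set name must be one that sacrebleu recognizes.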
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into shards while respecting document boundaries. Documents
should be separated by a single empty line.
"""
import argparse
import contextlib
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("--num-shards", type=int)
    args = parser.parse_args()
    assert args.num_shards is not None and args.num_shards > 1

    with open(args.input, "r", encoding="utf-8") as h:
        with contextlib.ExitStack() as stack:
            outputs = [
                stack.enter_context(
                    open(args.input + ".shard" + str(i), "w", encoding="utf-8")
                )
                for i in range(args.num_shards)
            ]

            doc = []
            first_doc = [True] * args.num_shards

            def output_doc(i):
                if not first_doc[i]:
                    outputs[i].write("\n")
                first_doc[i] = False
                for line in doc:
                    outputs[i].write(line)
                doc.clear()

            num_docs = 0
            for line in h:
                if line.strip() == "":  # empty line indicates new document
                    output_doc(num_docs % args.num_shards)
                    num_docs += 1
                else:
                    doc.append(line)
            output_doc(num_docs % args.num_shards)


if __name__ == "__main__":
    main()
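A minimal usage sketch, assuming the script above is saved as shard_docs.py and corpus.txt holds documents separated by blank lines:

    python shard_docs.py corpus.txt --num-shards 10

This writes corpus.txt.shard0 through corpus.txt.shard9 next to the input, assigning documents round-robin.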
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into a train and valid set while respecting document
boundaries. Documents should be separated by a single empty line.
"""
import argparse
import random
import sys
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("sample_output", help="train output file")
    parser.add_argument("remainder_output", help="valid output file")
    parser.add_argument("-k", type=int, help="remainder size")
    parser.add_argument(
        "--lines", action="store_true", help="split lines instead of docs"
    )
    args = parser.parse_args()
    assert args.k is not None

    sample = []
    remainder = []
    num_docs = [0]

    def update_sample(doc):
        if len(sample) < args.k:
            sample.append(doc.copy())
        else:
            i = num_docs[0]
            j = random.randrange(i + 1)
            if j < args.k:
                remainder.append(sample[j])
                sample[j] = doc.copy()
            else:
                remainder.append(doc.copy())
        num_docs[0] += 1
        doc.clear()

    with open(args.input, "r", encoding="utf-8") as h:
        doc = []
        for i, line in enumerate(h):
            if line.strip() == "":  # empty line indicates new document
                update_sample(doc)
            else:
                doc.append(line)
            if args.lines:
                update_sample(doc)
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
        if len(doc) > 0:
            update_sample(doc)
        print(file=sys.stderr, flush=True)

    assert len(sample) == args.k

    with open(args.sample_output, "w", encoding="utf-8") as out:
        first = True
        for doc in sample:
            if not first and not args.lines:
                out.write("\n")
            first = False
            for line in doc:
                out.write(line)

    with open(args.remainder_output, "w", encoding="utf-8") as out:
        first = True
        for doc in remainder:
            if not first and not args.lines:
                out.write("\n")
            first = False
            for line in doc:
                out.write(line)


if __name__ == "__main__":
    main()
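The update_sample helper is a reservoir-sampling step: the first k documents fill the sample, and each later document replaces a random slot with probability k/(i+1), so sample ends up as a uniform random subset of size k and every displaced or rejected document goes to remainder. A usage sketch with assumed file names:

    python split_train_valid_docs.py corpus.txt corpus.sample corpus.rest -k 1000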
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import sentencepiece as spm
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for decoding"
    )
    parser.add_argument("--input", required=True, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.input_format == "piece":

        def decode(input):
            return "".join(sp.DecodePieces(input))

    elif args.input_format == "id":

        def decode(input):
            return "".join(sp.DecodeIds(input))

    else:
        raise NotImplementedError

    def tok2int(tok):
        # remap reference-side <unk> (represented as <<unk>>) to 0
        return int(tok) if tok != "<<unk>>" else 0

    with open(args.input, "r", encoding="utf-8") as h:
        for line in h:
            if args.input_format == "id":
                print(decode(list(map(tok2int, line.rstrip().split()))))
            elif args.input_format == "piece":
                print(decode(line.rstrip().split()))


if __name__ == "__main__":
    main()
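A usage sketch for the decoding script above, with script and file names assumed for illustration:

    python spm_decode.py --model sentencepiece.model --input hyp.pieces > hyp.txt

With --input_format id the tokens are treated as integer IDs, and reference-side <<unk>> placeholders are mapped to ID 0 before decoding.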
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import contextlib
import sys
import sentencepiece as spm
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for encoding"
    )
    parser.add_argument(
        "--inputs", nargs="+", default=["-"], help="input files to filter/encode"
    )
    parser.add_argument(
        "--outputs", nargs="+", default=["-"], help="path to save encoded outputs"
    )
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument(
        "--min-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with fewer than N tokens",
    )
    parser.add_argument(
        "--max-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with more than N tokens",
    )
    args = parser.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":

        def encode(input):
            return sp.EncodeAsPieces(input)

    elif args.output_format == "id":

        def encode(input):
            return list(map(str, sp.EncodeAsIds(input)))

    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:

        def valid(line):
            return (args.min_len is None or len(line) >= args.min_len) and (
                args.max_len is None or len(line) <= args.max_len
            )

    else:

        def valid(lines):
            return True

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-"
            else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-"
            else sys.stdout
            for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)


if __name__ == "__main__":
    main()
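A usage sketch for the encoding script above (script and corpus names assumed). Parallel files are processed jointly, so a sentence pair is skipped if any side is empty or falls outside the --min-len/--max-len bounds:

    python spm_encode.py --model sentencepiece.model \
        --inputs train.en train.es \
        --outputs train.spm.en train.spm.es \
        --output_format piece --min-len 1 --max-len 510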
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import sentencepiece as spm
if __name__ == "__main__":
    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
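This wrapper simply forwards its command line to SentencePieceTrainer.Train, so standard sentencepiece training flags apply; for example (file names assumed):

    python spm_train.py --input=corpus.txt --model_prefix=spm_unigram_10000 \
        --vocab_size=10000 --model_type=unigram --character_coverage=1.0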
#!/usr/bin/env bash
rm -rf fsdp_dummy
mkdir -p fsdp_dummy
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \
--ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
--cpu-offload --checkpoint-activations \
--task language_modeling --tokens-per-sample 256 --batch-size 8 \
--arch transformer_lm_gpt2_tiny \
--optimizer cpu_adam --adam-betas "(0.9,0.98)" \
--lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
--max-update 5 --log-format json --log-interval 1 \
--save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \
--restore-file x.pt "$@"
# Now we try to load the checkpoint
CUDA_VISIBLE_DEVICES=0,1 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \
--ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
--cpu-offload --checkpoint-activations \
--task language_modeling --tokens-per-sample 256 --batch-size 8 \
--arch transformer_lm_gpt2_tiny \
--optimizer cpu_adam --adam-betas "(0.9,0.98)" \
--lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
--max-update 2 --log-format json --log-interval 1 \
--save-interval-updates 2 --save-dir fsdp_dummy
# ####################################
# Hubert SCT2T ED model #
# ####################################
world_size=$1
update_freq=$2
exp_name=$3
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=1
[ -z $exp_name ] && exp_name=sc2t_base_enes_${world_size}gpu_${update_freq}accum6666
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_enes"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_enes/bin-idx"
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_enes/$exp_name"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
--config-dir $CONFIG_DIR/pretrain \
--config-name sc2t_base_librispeech \
\
+task.store_labels=true \
task.labels='["km"]' \
model.label_rate=50 \
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
task.text_cfg.text_data=$TEXT_DATA_DIR \
+task.text_cfg.data_config=config.yaml \
task.text_cfg.text_maxtokens_ratio=3.0 \
\
+criterion.dec_loss_type="ce" \
\
criterion.text_weight=1.0 \
\
model.use_rel_pos_enc=true \
+model.code_use_rel_pos_enc=true \
+model.pad_with_code=true \
model.text_transformer.no_scale_embedding=true \
model.text_transformer.layernorm_embedding=true \
+model.share_decoder_input_output_embed=true \
\
dataset.train_subset=\"train_all+en.kmu-spm\" \
dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
dataset.num_workers=0 \
dataset.max_tokens=1000000 \
optimization.update_freq=[${update_freq}] \
optimization.max_update=400000 \
\
distributed_training.distributed_world_size=${world_size} \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=${exp_name}
sleep 5m
echo "All finished"
# ####################################
# Hubert SCT2T ED model #
# ####################################
world_size=$1
update_freq=$2
exp_name=$3
[ -z $world_size ] && world_size=24
[ -z $update_freq ] && update_freq=3
[ -z $exp_name ] && exp_name=sc2t_base_esen_${world_size}gpu_${update_freq}accum1
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_esen"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_esen"
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_esen/$exp_name"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR
python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
--config-dir $CONFIG_DIR/pretrain \
--config-name sc2t_base_librispeech \
\
+task.store_labels=true \
task.labels='["km"]' \
model.label_rate=50 \
task.data=$DATA_DIR \
task.label_dir=$DATA_DIR \
task.text_cfg.text_data=$TEXT_DATA_DIR \
+task.text_cfg.data_config=config.yaml \
task.text_cfg.text_maxtokens_ratio=3.0 \
\
+criterion.dec_loss_type="ce" \
\
criterion.text_weight=1.0 \
\
model.use_rel_pos_enc=true \
+model.code_use_rel_pos_enc=true \
+model.pad_with_code=true \
model.text_transformer.no_scale_embedding=true \
model.text_transformer.layernorm_embedding=true \
+model.share_decoder_input_output_embed=true \
\
dataset.train_subset=\"train+en.kmu-spm\" \
dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
dataset.num_workers=0 \
dataset.max_tokens=1000000 \
optimization.update_freq=[${update_freq}] \
optimization.max_update=400000 \
\
distributed_training.distributed_world_size=${world_size} \
\
common.tensorboard_logdir=$MODEL_DIR \
checkpoint.save_dir=$MODEL_DIR \
hydra.run.dir=$MODEL_DIR \
hydra.job.name=${exp_name}
sleep 5m
echo "All finished"
audio_root: ./
standardize_audio: true
use_audio_input: true
vocab_filename: dict.txt
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
  seed: 1337
checkpoint:
  save_interval: 1
  keep_last_epochs: 5
  keep_best_checkpoints: 5
  best_checkpoint_metric: wer
  restore_file: checkpoint_last.pt
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 1
  distributed_port: -1
  nprocs_per_node: 8
task:
  _name: hubert_pretraining
  data: ???
  fine_tuning: true
  label_dir: ???
  normalize: false  # must be consistent with pre-training
  labels: ["ltr"]
  single_target: true
  add_decoder: false
  pad_audio: false
  random_crop: true
  tokenizer: "none"
  sp_path: None
dataset:
  num_workers: 0
  max_tokens: 1200000
  skip_invalid_size_inputs_valid_test: true
  train_subset: train_100
  valid_subset: dev_other
  required_batch_size_multiple: 1
criterion:
  _name: label_smoothed_cross_entropy
  #zero_infinity: true
optimization:
  max_update: 80000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [1]
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
  weight_decay: 0.0
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: hubert_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.65
  mask_channel_prob: 0.5
  mask_channel_length: 64
  layerdrop: 0.1
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
  add_decoder: false
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
checkpoint:
  save_interval: 1
  keep_last_epochs: 10
  keep_best_checkpoints: 5
  best_checkpoint_metric: wer
  restore_file: checkpoint_last.pt
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 24
  distributed_port: -1
  nprocs_per_node: 8
task:
  _name: hubert_pretraining
  data: ???
  fine_tuning: true
  label_dir: ???
  normalize: true  # must be consistent with pre-training
  labels: ["ltr"]
  single_target: true
  add_decoder: false
  pad_audio: false
  random_crop: true
  tokenizer: "none"
  sp_path: None
dataset:
  num_workers: 0
  max_tokens: 1280000
  skip_invalid_size_inputs_valid_test: true
  valid_subset: dev_other
  required_batch_size_multiple: 1
criterion:
  _name: ctc
  zero_infinity: true
optimization:
  max_update: 200000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [1]
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
  weight_decay: 0.0
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: hubert_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.5
  mask_channel_prob: 0.25
  mask_channel_length: 64
  layerdrop: 0.0
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
  add_decoder: false
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true
task:
  _name: denoising
  data: ???
  mask: 0.15
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true
task:
  _name: joint_sc2t_pretraining
  data: ???
  label_dir: ???
  labels: ???
  label_rate: ${model.label_rate}
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: false  # must be consistent with extractor
  add_decoder: true
  text_cfg:
    seed: ${common.seed}
    text_data: ???
    sample_break_mode: eos
    tokens_per_sample: 1024
    shorten_method: "random_crop"
    text_maxtokens_ratio: 1.0
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
  seed: 1337
checkpoint:
  save_interval: 1000000
  keep_last_epochs: 5
  save_interval_updates: 1000
  keep_interval_updates_pattern: 10000
  keep_interval_updates: 5
  best_checkpoint_metric: accuracy
  maximize_best_checkpoint_metric: true
distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 1
  nprocs_per_node: 8
criterion:
  _name: "label_smoothed_cross_entropy"
task:
  _name: "translation_from_jst"
dataset:
  num_workers: 0
  max_tokens: 4096
  skip_invalid_size_inputs_valid_test: true
  validate_after_updates: ${model.freeze_finetune_updates}
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  train_subset: train_clean_100
  valid_subset: dev_clean
  required_batch_size_multiple: 1
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.0
lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05
model:
  _name: hubert_t2c
  w2v_path: ???
  layerdrop: 0.1
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_
common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true
task:
  _name: denoising
  data: ???
  mask: 0.15
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0
optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01
lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
if [ ! -d ${HOME}/azcopy_linux_amd64_10.11.0 ]; then
CURRENT_DIR=`pwd`
cd ${HOME} && wget https://azcopyvnext.azureedge.net/release20210616/azcopy_linux_amd64_10.11.0.tar.gz && tar -zxvf azcopy_linux_amd64_10.11.0.tar.gz && rm -f azcopy_linux_amd64_10.11.0.tar.gz && cd ${CURRENT_DIR}
fi
export PATH=$PATH:${HOME}/azcopy_linux_amd64_10.11.0/:${HOME}/.local/bin
export PYTHONPATH=$PYTHONPATH:/mnt/output/users/v-kunwei/code/fairseq
rank=$1
nshard=$2
split=$3
[ -z "$rank" ] && echo "please specify rank" && exit 1
[ -z $nshard ] && nshard=1
[ -z $split ] && split="train"
FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq
ckpt_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3.pt
tsv_dir=/home/v-kunwei
feat_dir=${HOME}/$split
python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} 9 ${nshard} ${rank} ${feat_dir} || exit 1
echo "-------------------------------------------------------------------------------------------"
echo "---------------------------------- done ---------------------------------------------"
echo "-------------------------------------------------------------------------------------------"
km_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin
lab_dir=${HOME}/${split}
python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
# sas="?sv=2020-08-04&st=2022-01-02T04%3A58%3A15Z&se=2022-06-01T04%3A58%3A00Z&sr=c&sp=racwdl&sig=NyZKOHivgesEoZ8yvLsVT6aZMYQZMevLLmXNOTaWyvU%3D"
# blob="https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-ziqzhang/data/stbert/data/librispeech/libri_960/hubert_release_iter2_layer9_kmeans/${split}"
# azcopy copy $feat_dir/${split}_${rank}_${nshard}.len "$blob/$sas"
# azcopy copy $feat_dir/${split}_${rank}_${nshard}.npy "$blob/$sas"
# azcopy copy $lab_dir "$blob/$sas" --recursive
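A usage sketch (script name assumed): dump layer-9 HuBERT features and k-means labels for shard 0 of 4 of the train split:

    bash dump_feature_label.sh 0 4 train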
import sys

import torch


def main():
    for line in sys.stdin:
        line = line.rstrip()
        codes = list(map(int, line.split()))
        merged_codes = torch.unique_consecutive(torch.tensor(codes)).numpy()
        merged_codes = map(str, merged_codes)
        print(" ".join(merged_codes))


if __name__ == "__main__":
    main()
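A usage sketch for the merge script above (file names assumed). It collapses runs of repeated k-means units, e.g. "5 5 5 12 12 7" becomes "5 12 7":

    python merge_duplicate_codes.py < train.km > train.kmu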
[ $# -lt 3 ] && echo "Usage: $0 <input-text> <outdir> <DICT> <suffix>" && exit 0
if [ ! -d ${HOME}/sentencepiece ]; then
CURRENT_DIR=`pwd`
cd ${HOME}
git clone https://github.com/google/sentencepiece.git
cd sentencepiece
mkdir build && cd build
cmake .. && make -j 16
sudo make install
sudo ldconfig -v
cd ${HOME}
cd ${CURRENT_DIR}
fi
input=$1
outdir=$2
DICT=$3
suffix=$4
outname=${input##*/}
outname=${outname%.txt*}
[ -z $input ] && echo "You must specify a source file" && exit 1
[ -z $DICT ] && echo "No dict was specified!" && exit 1
[ -z $outdir ] && outdir=${input%/*}
[ -z $outdir ] && outdir="."
[ ! -d $outdir ] && mkdir -p $outdir
echo "Dict : $DICT"
echo "------------------------------- creating idx/bin--------------------------------------------"
echo "$input --> $outdir/${outname}${suffix}.idx"
fairseq-preprocess \
--only-source \
--trainpref $input \
--destdir $outdir \
--thresholdsrc 0 \
--srcdict ${DICT} \
--workers 40
mv $outdir/train.idx $outdir/${outname}${suffix}.idx
mv $outdir/train.bin $outdir/${outname}${suffix}.bin
echo "----------------------------------- done --------------------------------------------"
[ $# -lt 2 ] && echo "Usage: $0 <input-text> <outdir> <MODEL> <suffix>" && exit 0
if [ ! -d ${HOME}/sentencepiece ]; then
CURRENT_DIR=`pwd`
cd ${HOME}
git clone https://github.com/google/sentencepiece.git
cd sentencepiece
mkdir build && cd build
cmake .. && make -j 16
sudo make install
sudo ldconfig -v
cd ${HOME}
cd ${CURRENT_DIR}
fi
input=$1
outdir=$2
MODEL=$3
suffix=$4
outname=${input##*/}
outname=${outname%.wrd*}
[ -z $input ] && echo "You must specify a source file" && exit 1
[ -z $MODEL ] && MODEL=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/spm_unigram_10000.model && echo "No spm model was specified!, set default to $MODEL"
[ -z $outdir ] && outdir=${input%/*}
[ -z $outdir ] && outdir="."
[ ! -d $outdir ] && mkdir -p $outdir
echo "Output: $outdir/$outname.spm"
echo "------------------------------- tokenize text...--------------------------------------------"
spm_encode --model=$MODEL < ${input} > $outdir/$outname.spm || exit 1
echo "----------------------------------- done --------------------------------------------"