ModelZoo / SpeechT5_pytorch / Commits

Commit 12c90639, authored Sep 28, 2024 by “change”
Commit message: init
Parent commit: 417b607b

This excerpt shows 20 of the commit's 350 changed files, with 1299 additions and 0 deletions (+1299, -0).
Changed files shown on this page:

- Speech2S/speech2s/scripts/sacrebleu.sh (+27, -0)
- Speech2S/speech2s/scripts/shard_docs.py (+54, -0)
- Speech2S/speech2s/scripts/split_train_valid_docs.py (+86, -0)
- Speech2S/speech2s/scripts/spm_decode.py (+53, -0)
- Speech2S/speech2s/scripts/spm_encode.py (+119, -0)
- Speech2S/speech2s/scripts/spm_train.py (+16, -0)
- Speech2S/speech2s/scripts/test_fsdp.sh (+24, -0)
- Speech2S/speech2s/stpretrain_scripts/base_sc2c_enes.sh (+64, -0)
- Speech2S/speech2s/stpretrain_scripts/base_sc2c_esen.sh (+64, -0)
- Speech2S/speech2s/stpretrain_scripts/config.yaml (+4, -0)
- Speech2S/speech2s/stpretrain_scripts/config/finetune_asr/base_100h.yaml (+101, -0)
- Speech2S/speech2s/stpretrain_scripts/config/finetune_asr/large_960h.yaml (+98, -0)
- Speech2S/speech2s/stpretrain_scripts/config/pretrain/mbart.yaml (+120, -0)
- Speech2S/speech2s/stpretrain_scripts/config/pretrain/sc2t_base_librispeech.yaml (+137, -0)
- Speech2S/speech2s/stpretrain_scripts/config/translation/text2code.yaml (+81, -0)
- Speech2S/speech2s/stpretrain_scripts/config_mbart.yaml (+120, -0)
- Speech2S/speech2s/stpretrain_scripts/data_process/extract_hubert_feature_itp.sh (+41, -0)
- Speech2S/speech2s/stpretrain_scripts/data_process/merge_code.py (+14, -0)
- Speech2S/speech2s/stpretrain_scripts/data_process/txt2idx.sh (+43, -0)
- Speech2S/speech2s/stpretrain_scripts/data_process/txt2spm.sh (+33, -0)
Speech2S/speech2s/scripts/sacrebleu.sh (new file, 0 → 100644)

#!/bin/bash

if [ $# -ne 4 ]; then
    echo "usage: $0 TESTSET SRCLANG TGTLANG GEN"
    exit 1
fi

TESTSET=$1
SRCLANG=$2
TGTLANG=$3
GEN=$4

if ! command -v sacremoses &> /dev/null
then
    echo "sacremoses could not be found, please install with: pip install sacremoses"
    exit
fi

grep ^H $GEN \
| sed 's/^H\-//' \
| sort -n -k 1 \
| cut -f 3 \
| sacremoses detokenize \
> $GEN.sorted.detok

sacrebleu --test-set $TESTSET --language-pair "${SRCLANG}-${TGTLANG}" < $GEN.sorted.detok
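
A usage sketch (test-set name and log path are hypothetical): GEN is expected to be a fairseq-generate log whose hypothesis lines start with "H-"; the pipeline above sorts them by sentence index, detokenizes them, and scores them against the named sacrebleu test set.

# hypothetical: score EN->ES hypotheses from a fairseq-generate log
bash Speech2S/speech2s/scripts/sacrebleu.sh wmt13 en es /path/to/generate.log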
Speech2S/speech2s/scripts/shard_docs.py (new file, 0 → 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into shards while respecting document boundaries. Documents
should be separated by a single empty line.
"""

import argparse
import contextlib


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("--num-shards", type=int)
    args = parser.parse_args()

    assert args.num_shards is not None and args.num_shards > 1

    with open(args.input, "r", encoding="utf-8") as h:
        with contextlib.ExitStack() as stack:
            outputs = [
                stack.enter_context(
                    open(args.input + ".shard" + str(i), "w", encoding="utf-8")
                )
                for i in range(args.num_shards)
            ]

            doc = []
            first_doc = [True] * args.num_shards

            def output_doc(i):
                if not first_doc[i]:
                    outputs[i].write("\n")
                first_doc[i] = False
                for line in doc:
                    outputs[i].write(line)
                doc.clear()

            num_docs = 0
            for line in h:
                if line.strip() == "":  # empty line indicates new document
                    output_doc(num_docs % args.num_shards)
                    num_docs += 1
                else:
                    doc.append(line)
            output_doc(num_docs % args.num_shards)


if __name__ == "__main__":
    main()
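
For reference, a hypothetical invocation (file name assumed): sharding corpus.txt four ways writes corpus.txt.shard0 through corpus.txt.shard3, assigning whole documents round-robin so no document is split across shards.

# hypothetical: shard a blank-line-delimited corpus into 4 pieces
python Speech2S/speech2s/scripts/shard_docs.py corpus.txt --num-shards 4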
Speech2S/speech2s/scripts/split_train_valid_docs.py (new file, 0 → 100644)

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Split a large file into a train and valid set while respecting document
boundaries. Documents should be separated by a single empty line.
"""

import argparse
import random
import sys


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("sample_output", help="train output file")
    parser.add_argument("remainder_output", help="valid output file")
    parser.add_argument("-k", type=int, help="remainder size")
    parser.add_argument(
        "--lines", action="store_true", help="split lines instead of docs"
    )
    args = parser.parse_args()

    assert args.k is not None

    sample = []
    remainder = []
    num_docs = [0]

    def update_sample(doc):
        if len(sample) < args.k:
            sample.append(doc.copy())
        else:
            i = num_docs[0]
            j = random.randrange(i + 1)
            if j < args.k:
                remainder.append(sample[j])
                sample[j] = doc.copy()
            else:
                remainder.append(doc.copy())
        num_docs[0] += 1
        doc.clear()

    with open(args.input, "r", encoding="utf-8") as h:
        doc = []
        for i, line in enumerate(h):
            if line.strip() == "":  # empty line indicates new document
                update_sample(doc)
            else:
                doc.append(line)
            if args.lines:
                update_sample(doc)
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
        if len(doc) > 0:
            update_sample(doc)
    print(file=sys.stderr, flush=True)

    assert len(sample) == args.k

    with open(args.sample_output, "w", encoding="utf-8") as out:
        first = True
        for doc in sample:
            if not first and not args.lines:
                out.write("\n")
            first = False
            for line in doc:
                out.write(line)

    with open(args.remainder_output, "w", encoding="utf-8") as out:
        first = True
        for doc in remainder:
            if not first and not args.lines:
                out.write("\n")
            first = False
            for line in doc:
                out.write(line)


if __name__ == "__main__":
    main()
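
The sampling logic is a classic reservoir sample: the first k documents fill `sample`, and each later document replaces a uniformly chosen reservoir slot with probability k/(i+1), so `sample_output` receives exactly k uniformly sampled documents (despite the "-k remainder size" help string) and every displaced or rejected document goes to `remainder_output`. A hypothetical invocation (file names assumed):

# hypothetical: hold out 2000 uniformly sampled documents, rest to remainder
python Speech2S/speech2s/scripts/split_train_valid_docs.py corpus.txt sampled.txt rest.txt -k 2000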
Speech2S/speech2s/scripts/spm_decode.py (new file, 0 → 100644)

#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse

import sentencepiece as spm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for decoding"
    )
    parser.add_argument("--input", required=True, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.input_format == "piece":

        def decode(input):
            return "".join(sp.DecodePieces(input))

    elif args.input_format == "id":

        def decode(input):
            return "".join(sp.DecodeIds(input))

    else:
        raise NotImplementedError

    def tok2int(tok):
        # remap reference-side <unk> (represented as <<unk>>) to 0
        return int(tok) if tok != "<<unk>>" else 0

    with open(args.input, "r", encoding="utf-8") as h:
        for line in h:
            if args.input_format == "id":
                print(decode(list(map(tok2int, line.rstrip().split()))))
            elif args.input_format == "piece":
                print(decode(line.rstrip().split()))


if __name__ == "__main__":
    main()
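
A hypothetical invocation (model and file names assumed), acting as the inverse of spm_encode.py below:

# hypothetical: turn piece-format output back into plain text
python Speech2S/speech2s/scripts/spm_decode.py --model spm.model --input test.spm > test.txt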
Speech2S/speech2s/scripts/spm_encode.py (new file, 0 → 100644)

#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import contextlib
import sys

import sentencepiece as spm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for encoding"
    )
    parser.add_argument(
        "--inputs", nargs="+", default=["-"], help="input files to filter/encode"
    )
    parser.add_argument(
        "--outputs", nargs="+", default=["-"], help="path to save encoded outputs"
    )
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument(
        "--min-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with fewer than N tokens",
    )
    parser.add_argument(
        "--max-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with more than N tokens",
    )
    args = parser.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":

        def encode(input):
            return sp.EncodeAsPieces(input)

    elif args.output_format == "id":

        def encode(input):
            return list(map(str, sp.EncodeAsIds(input)))

    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:

        def valid(line):
            return (args.min_len is None or len(line) >= args.min_len) and (
                args.max_len is None or len(line) <= args.max_len
            )

    else:

        def valid(lines):
            return True

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(input, "r", encoding="utf-8"))
            if input != "-"
            else sys.stdin
            for input in args.inputs
        ]
        outputs = [
            stack.enter_context(open(output, "w", encoding="utf-8"))
            if output != "-"
            else sys.stdout
            for output in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            line = line.strip()
            if len(line) > 0:
                line = encode(line)
                if valid(line):
                    return line
                else:
                    stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)


if __name__ == "__main__":
    main()
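
Because the inputs are read in parallel via zip, a line is dropped from all outputs whenever any side is empty or fails the length filter, which keeps parallel files aligned. A hypothetical bitext run (model and file names assumed):

# hypothetical: encode an EN-ES bitext, filtering pairs outside 1..512 pieces
python Speech2S/speech2s/scripts/spm_encode.py --model spm.model \
    --inputs train.en train.es \
    --outputs train.spm.en train.spm.es \
    --min-len 1 --max-len 512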
Speech2S/speech2s/scripts/spm_train.py (new file, 0 → 100644)

#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import sys

import sentencepiece as spm


if __name__ == "__main__":
    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
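
The wrapper just joins argv and forwards it to SentencePieceTrainer.Train, so any native sentencepiece training flag works. A hypothetical run (values illustrative; a 10k-piece unigram vocabulary would match the spm_unigram_10000.model default referenced in txt2spm.sh below):

# hypothetical: train a 10k-piece unigram model
python Speech2S/speech2s/scripts/spm_train.py \
    --input=corpus.txt --model_prefix=spm_unigram_10000 \
    --vocab_size=10000 --model_type=unigram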
Speech2S/speech2s/scripts/test_fsdp.sh (new file, 0 → 100644)

#!/usr/bin/env bash
rm -rf fsdp_dummy
mkdir -p fsdp_dummy
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \
    --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
    --cpu-offload --checkpoint-activations \
    --task language_modeling --tokens-per-sample 256 --batch-size 8 \
    --arch transformer_lm_gpt2_tiny \
    --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
    --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
    --max-update 5 --log-format json --log-interval 1 \
    --save-interval-updates 5 --save-dir fsdp_dummy --disable-validation \
    --restore-file x.pt "$@"

# Now we try to load the checkpoint
CUDA_VISIBLE_DEVICES=0,1 fairseq-train /private/home/sshleifer/data-bin/stories_mmap \
    --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \
    --cpu-offload --checkpoint-activations \
    --task language_modeling --tokens-per-sample 256 --batch-size 8 \
    --arch transformer_lm_gpt2_tiny \
    --optimizer cpu_adam --adam-betas "(0.9,0.98)" \
    --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \
    --max-update 2 --log-format json --log-interval 1 \
    --save-interval-updates 2 --save-dir fsdp_dummy
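
The trailing "$@" forwards any extra fairseq-train overrides to the first run only, e.g. (override purely illustrative):

# extra flags are appended to the first training run via "$@"
bash Speech2S/speech2s/scripts/test_fsdp.sh --num-workers 0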
Speech2S/speech2s/stpretrain_scripts/base_sc2c_enes.sh (new file, 0 → 100644)

# ####################################
#       Hubert SCT2T ED model        #
# ####################################
world_size=$1
update_freq=$2
exp_name=$3
[ -z $world_size ] && world_size=8
[ -z $update_freq ] && update_freq=1
[ -z $exp_name ] && exp_name=sc2t_base_enes_${world_size}gpu_${update_freq}accum6666

FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_enes"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_enes/bin-idx"
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_enes/$exp_name"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR

python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
    --config-dir $CONFIG_DIR/pretrain \
    --config-name sc2t_base_librispeech \
    \
    +task.store_labels=true \
    task.labels='["km"]' \
    model.label_rate=50 \
    task.data=$DATA_DIR \
    task.label_dir=$DATA_DIR \
    task.text_cfg.text_data=$TEXT_DATA_DIR \
    +task.text_cfg.data_config=config.yaml \
    task.text_cfg.text_maxtokens_ratio=3.0 \
    \
    +criterion.dec_loss_type="ce" \
    \
    criterion.text_weight=1.0 \
    \
    model.use_rel_pos_enc=true \
    +model.code_use_rel_pos_enc=true \
    +model.pad_with_code=true \
    model.text_transformer.no_scale_embedding=true \
    model.text_transformer.layernorm_embedding=true \
    +model.share_decoder_input_output_embed=true \
    \
    dataset.train_subset=\"train_all+en.kmu-spm\" \
    dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
    dataset.num_workers=0 \
    dataset.max_tokens=1000000 \
    optimization.update_freq=[${update_freq}] \
    optimization.max_update=400000 \
    \
    distributed_training.distributed_world_size=${world_size} \
    \
    common.tensorboard_logdir=$MODEL_DIR \
    checkpoint.save_dir=$MODEL_DIR \
    hydra.run.dir=$MODEL_DIR \
    hydra.job.name=${exp_name}

sleep 5m
echo "All finished"
Speech2S/speech2s/stpretrain_scripts/base_sc2c_esen.sh (new file, 0 → 100644)

# ####################################
#       Hubert SCT2T ED model        #
# ####################################
world_size=$1
update_freq=$2
exp_name=$3
[ -z $world_size ] && world_size=24
[ -z $update_freq ] && update_freq=3
[ -z $exp_name ] && exp_name=sc2t_base_esen_${world_size}gpu_${update_freq}accum1

FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq_mlstku
CONFIG_DIR=/mnt/output/users/v-kunwei/code/stpretrain_scripts/config
DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/speech_esen"
TEXT_DATA_DIR="/mnt/output/users/v-kunwei/data/s2s_data/text_esen"
MODEL_DIR="/mnt/output/v-kunwei/data/s2s_data/exp/S2S_esen/$exp_name"
[ -d $MODEL_DIR ] || mkdir -p $MODEL_DIR

python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
    --config-dir $CONFIG_DIR/pretrain \
    --config-name sc2t_base_librispeech \
    \
    +task.store_labels=true \
    task.labels='["km"]' \
    model.label_rate=50 \
    task.data=$DATA_DIR \
    task.label_dir=$DATA_DIR \
    task.text_cfg.text_data=$TEXT_DATA_DIR \
    +task.text_cfg.data_config=config.yaml \
    task.text_cfg.text_maxtokens_ratio=3.0 \
    \
    +criterion.dec_loss_type="ce" \
    \
    criterion.text_weight=1.0 \
    \
    model.use_rel_pos_enc=true \
    +model.code_use_rel_pos_enc=true \
    +model.pad_with_code=true \
    model.text_transformer.no_scale_embedding=true \
    model.text_transformer.layernorm_embedding=true \
    +model.share_decoder_input_output_embed=true \
    \
    dataset.train_subset=\"train+en.kmu-spm\" \
    dataset.valid_subset=\"valid+en_valid.kmu-spm\" \
    dataset.num_workers=0 \
    dataset.max_tokens=1000000 \
    optimization.update_freq=[${update_freq}] \
    optimization.max_update=400000 \
    \
    distributed_training.distributed_world_size=${world_size} \
    \
    common.tensorboard_logdir=$MODEL_DIR \
    checkpoint.save_dir=$MODEL_DIR \
    hydra.run.dir=$MODEL_DIR \
    hydra.job.name=${exp_name}

sleep 5m
echo "All finished"
Speech2S/speech2s/stpretrain_scripts/config.yaml (new file, 0 → 100644)

audio_root: ./
standardize_audio: true
use_audio_input: true
vocab_filename: dict.txt
Speech2S/speech2s/stpretrain_scripts/config/finetune_asr/base_100h.yaml (new file, 0 → 100644)

# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
  seed: 1337

checkpoint:
  save_interval: 1
  keep_last_epochs: 5
  keep_best_checkpoints: 5
  best_checkpoint_metric: wer
  restore_file: checkpoint_last.pt

distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 1
  distributed_port: -1
  nprocs_per_node: 8

task:
  _name: hubert_pretraining
  data: ???
  fine_tuning: true
  label_dir: ???
  normalize: false  # must be consistent with pre-training
  labels: ["ltr"]
  single_target: true
  add_decoder: false
  pad_audio: false
  random_crop: true
  tokenizer: "none"
  sp_path: None

dataset:
  num_workers: 0
  max_tokens: 1200000
  skip_invalid_size_inputs_valid_test: true
  train_subset: train_100
  valid_subset: dev_other
  required_batch_size_multiple: 1

criterion:
  _name: label_smoothed_cross_entropy
  #zero_infinity: true

optimization:
  max_update: 80000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [1]

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
  weight_decay: 0.0

lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05

model:
  _name: hubert_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.65
  mask_channel_prob: 0.5
  mask_channel_length: 64
  layerdrop: 0.1
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
  add_decoder: false

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
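
The ??? entries are mandatory overrides. A hypothetical fine-tuning launch in the same hydra_train.py style as the pre-training scripts above (all paths assumed):

python $FAIRSEQ_ROOT/fairseq_cli/hydra_train.py \
    --config-dir Speech2S/speech2s/stpretrain_scripts/config/finetune_asr \
    --config-name base_100h \
    task.data=/path/to/librispeech_100h \
    task.label_dir=/path/to/ltr_labels \
    model.w2v_path=/path/to/pretrained_checkpoint.pt \
    hydra.run.dir=/path/to/exp_dir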
Speech2S/speech2s/stpretrain_scripts/config/finetune_asr/large_960h.yaml (new file, 0 → 100644)

# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog

checkpoint:
  save_interval: 1
  keep_last_epochs: 10
  keep_best_checkpoints: 5
  best_checkpoint_metric: wer
  restore_file: checkpoint_last.pt

distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 24
  distributed_port: -1
  nprocs_per_node: 8

task:
  _name: hubert_pretraining
  data: ???
  fine_tuning: true
  label_dir: ???
  normalize: true  # must be consistent with pre-training
  labels: ["ltr"]
  single_target: true
  add_decoder: false
  pad_audio: false
  random_crop: true
  tokenizer: "none"
  sp_path: None

dataset:
  num_workers: 0
  max_tokens: 1280000
  skip_invalid_size_inputs_valid_test: true
  valid_subset: dev_other
  required_batch_size_multiple: 1

criterion:
  _name: ctc
  zero_infinity: true

optimization:
  max_update: 200000
  lr: [0.00003]
  sentence_avg: true
  update_freq: [1]

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-08
  weight_decay: 0.0

lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05

model:
  _name: hubert_ctc
  w2v_path: ???
  apply_mask: true
  mask_prob: 0.5
  mask_channel_prob: 0.25
  mask_channel_length: 64
  layerdrop: 0.0
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0
  add_decoder: false

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
          - criterion.wer_kenlm_model
          - criterion.wer_lexicon
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
Speech2S/speech2s/stpretrain_scripts/config/pretrain/mbart.yaml (new file, 0 → 100644)

# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true

distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true

task:
  _name: denoising
  data: ???
  mask: 0.15

dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1

criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1

optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000

model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
Speech2S/speech2s/stpretrain_scripts/config/pretrain/sc2t_base_librispeech.yaml (new file, 0 → 100644)

# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true

distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true

task:
  _name: joint_sc2t_pretraining
  data: ???
  label_dir: ???
  labels: ???
  label_rate: ${model.label_rate}
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: false  # must be consistent with extractor
  add_decoder: true
  text_cfg:
    seed: ${common.seed}
    text_data: ???
    sample_break_mode: eos
    tokens_per_sample: 1024
    shorten_method: "random_crop"
    text_maxtokens_ratio: 1.0

dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1

criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1

optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000

model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
Speech2S/speech2s/stpretrain_scripts/config/translation/text2code.yaml (new file, 0 → 100644)

# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tblog
  seed: 1337

checkpoint:
  save_interval: 1000000
  keep_last_epochs: 5
  save_interval_updates: 1000
  keep_interval_updates_pattern: 10000
  keep_interval_updates: 5
  best_checkpoint_metric: accuracy
  maximize_best_checkpoint_metric: true

distributed_training:
  ddp_backend: c10d
  find_unused_parameters: true
  distributed_world_size: 1
  nprocs_per_node: 8

criterion:
  _name: "label_smoothed_cross_entropy"

task:
  _name: "translation_from_jst"

dataset:
  num_workers: 0
  max_tokens: 4096
  skip_invalid_size_inputs_valid_test: true
  validate_after_updates: ${model.freeze_finetune_updates}
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  train_subset: train_clean_100
  valid_subset: dev_clean
  required_batch_size_multiple: 1

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.0

lr_scheduler:
  _name: tri_stage
  phase_ratio: [0.1, 0.4, 0.5]
  final_lr_scale: 0.05

model:
  _name: hubert_t2c
  w2v_path: ???
  layerdrop: 0.1
  decoder_layerdrop: 0.1
  activation_dropout: 0.1
  feature_grad_mult: 0.0
  freeze_finetune_updates: 0

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
          - model.w2v_path
          - dataset.train_subset
          - dataset.valid_subset
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
Speech2S/speech2s/stpretrain_scripts/config_mbart.yaml (new file, 0 → 100644)

# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 20000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true

distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 8
  nprocs_per_node: 8
  find_unused_parameters: true

task:
  _name: denoising
  data: ???
  mask: 0.15

dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1

criterion:
  _name: sc2t
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  label_smoothing: 0.1
  text_weight: 0.1

optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000

model:
  _name: stbert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layers: 6
  encoder_attention_heads: 8
  decoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_code_encoder: true
  add_adaptor: false
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    adaptive_input: ${model.adaptive_input}
    max_source_positions: 3000
    checkpoint_activations: ${model.checkpoint_activations}
    no_scale_embedding: false
    layernorm_embedding: false
    quant_noise:
      pq: ${model.quant_noise_pq}
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
Speech2S/speech2s/stpretrain_scripts/data_process/extract_hubert_feature_itp.sh (new file, 0 → 100644)

if [ ! -d ${HOME}/azcopy_linux_amd64_10.11.0 ]; then
    CURRENT_DIR=`pwd`
    cd ${HOME} && \
    wget https://azcopyvnext.azureedge.net/release20210616/azcopy_linux_amd64_10.11.0.tar.gz && \
    tar -zxvf azcopy_linux_amd64_10.11.0.tar.gz && \
    rm -f azcopy_linux_amd64_10.11.0.tar.gz && \
    cd ${CURRENT_DIR}
fi
export PATH=$PATH:${HOME}/azcopy_linux_amd64_10.11.0/:${HOME}/.local/bin
export PYTHONPATH=$PYTHONPATH:/mnt/output/users/v-kunwei/code/fairseq

rank=$1
nshard=$2
split=$3
[ -z $rank ] && echo "please specify rank"
[ -z $nshard ] && nshard=1
[ -z $split ] && split="train"

FAIRSEQ_ROOT=/mnt/output/users/v-kunwei/code/fairseq
ckpt_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3.pt
tsv_dir=/home/v-kunwei
feat_dir=${HOME}/$split

python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_hubert_feature.py \
    ${tsv_dir} ${split} ${ckpt_path} 9 ${nshard} ${rank} ${feat_dir} || exit 1

echo "-------------------------------------------------------------------------------------------"
echo "---------------------------------- done ---------------------------------------------"
echo "-------------------------------------------------------------------------------------------"

km_path=/mnt/output/users/v-kunwei/code/fairseq/examples/speech_to_speech/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin
lab_dir=${HOME}/${split}

python $FAIRSEQ_ROOT/examples/hubert/simple_kmeans/dump_km_label.py \
    ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}

# sas="?sv=2020-08-04&st=2022-01-02T04%3A58%3A15Z&se=2022-06-01T04%3A58%3A00Z&sr=c&sp=racwdl&sig=NyZKOHivgesEoZ8yvLsVT6aZMYQZMevLLmXNOTaWyvU%3D"
# blob="https://msranlcmtteamdrive.blob.core.windows.net/teamdrive/v-ziqzhang/data/stbert/data/librispeech/libri_960/hubert_release_iter2_layer9_kmeans/${split}"
# azcopy copy $feat_dir/${split}_${rank}_${nshard}.len "$blob/$sas"
# azcopy copy $feat_dir/${split}_${rank}_${nshard}.npy "$blob/$sas"
# azcopy copy $lab_dir "$blob/$sas" --recursive
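
Feature extraction and k-means labeling are sharded by (rank, nshard), so shards can run in parallel. A hypothetical four-shard run over the train split:

# hypothetical: dump features and labels for 4 shards in parallel
for rank in 0 1 2 3; do
    bash Speech2S/speech2s/stpretrain_scripts/data_process/extract_hubert_feature_itp.sh $rank 4 train &
done
wait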
Speech2S/speech2s/stpretrain_scripts/data_process/merge_code.py (new file, 0 → 100644)

import sys

import torch


def main():
    for line in sys.stdin:
        line = line.rstrip()
        codes = list(map(int, line.split()))
        merged_codes = torch.unique_consecutive(torch.tensor(codes)).numpy()
        merged_codes = map(str, merged_codes)
        print(" ".join(merged_codes))


if __name__ == "__main__":
    main()
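
The script run-length-collapses framewise k-means units into a deduplicated code sequence via torch.unique_consecutive, e.g.:

# prints "5 7 5 9"
echo "5 5 5 7 7 5 9 9" | python Speech2S/speech2s/stpretrain_scripts/data_process/merge_code.py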
Speech2S/speech2s/stpretrain_scripts/data_process/txt2idx.sh (new file, 0 → 100644)

[ $# -lt 3 ] && echo "Usage: $0 <input-text> <outdir> <DICT> <suffix>" && exit 0

if [ ! -d ${HOME}/sentencepiece ]; then
    CURRENT_DIR=`pwd`
    cd ${HOME}
    git clone https://github.com/google/sentencepiece.git
    cd sentencepiece
    mkdir build && cd build
    cmake .. && make -j 16
    sudo make install
    sudo ldconfig -v
    cd ${HOME}
    cd ${CURRENT_DIR}
fi

input=$1
outdir=$2
DICT=$3
suffix=$4
outname=${input##*/}
outname=${outname%.txt*}

[ -z $input ] && echo "You must specify a source file" && exit 1
[ -z $DICT ] && echo "No dict was specified!" && exit 1
[ -z $outdir ] && outdir=${input%/*}
[ -z $outdir ] && outdir="."
[ ! -d $outdir ] && mkdir -p $outdir

echo "Dict : $DICT"
echo "------------------------------- creating idx/bin--------------------------------------------"
echo "$input --> $outdir/${outname}${suffix}.idx"

fairseq-preprocess \
    --only-source \
    --trainpref $input \
    --destdir $outdir \
    --thresholdsrc 0 \
    --srcdict ${DICT} \
    --workers 40

mv $outdir/train.idx $outdir/${outname}${suffix}.idx
mv $outdir/train.bin $outdir/${outname}${suffix}.bin
echo "----------------------------------- done --------------------------------------------"
Speech2S/speech2s/stpretrain_scripts/data_process/txt2spm.sh (new file, 0 → 100644)

[ $# -lt 2 ] && echo "Usage: $0 <input-text> <outdir> <MODEL> <suffix>" && exit 0

if [ ! -d ${HOME}/sentencepiece ]; then
    CURRENT_DIR=`pwd`
    cd ${HOME}
    git clone https://github.com/google/sentencepiece.git
    cd sentencepiece
    mkdir build && cd build
    cmake .. && make -j 16
    sudo make install
    sudo ldconfig -v
    cd ${HOME}
    cd ${CURRENT_DIR}
fi

input=$1
outdir=$2
MODEL=$3
suffix=$4
outname=${input##*/}
outname=${outname%.wrd*}

[ -z $input ] && echo "You must specify a source file" && exit 1
[ -z $MODEL ] && MODEL=/mnt/default/v-ziqzhang/data/stbert/data/librispeech/hubert_release_iter2_layer9_kmeans/spm_unigram_10000.model && echo "No spm model was specified!, set default to $MODEL"
[ -z $outdir ] && outdir=${input%/*}
[ -z $outdir ] && outdir="."
[ ! -d $outdir ] && mkdir -p $outdir

echo "Output: $outdir/$outname.spm"
echo "------------------------------- tokenize text...--------------------------------------------"
spm_encode --model=$MODEL < ${input} > $outdir/$outname.spm || exit 1
echo "----------------------------------- done --------------------------------------------"