Commit c394d7d1 authored by “change”'s avatar “change”
Browse files

init

parents
# @package _global_

# Label settings: `label_dir` must be supplied by the user (`???`), the label
# files use the "km" extension, and labels come at a rate of 50.
task:
  label_dir: ???
  labels:
    - "km"

model:
  label_rate: 50
# @package _group_

# Pretraining configuration. Values written as `???` are left unset here and
# have to be provided when the job is launched.

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 32
  distributed_port: 29671
  nprocs_per_node: 8
  find_unused_parameters: true

task:
  _name: hubert_pretraining
  data: ???
  label_dir: ???
  labels: ???
  # Kept in sync with the model section through interpolation.
  label_rate: ${model.label_rate}
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: false  # must be consistent with extractor

dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  validate_interval_updates: 10000

criterion:
  _name: hubert
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]

optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000

model:
  _name: hubert
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  encoder_layerdrop: 0.05
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        # Overrides listed here are left out of the generated directory name.
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_

# Pretraining configuration for the 24-layer / 1024-dim encoder. Values written
# as `???` are left unset here and have to be provided when the job is launched.

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 128
  distributed_port: 29671
  nprocs_per_node: 8
  find_unused_parameters: true

task:
  _name: hubert_pretraining
  data: ???
  label_dir: ???
  labels: ???
  # Kept in sync with the model section through interpolation.
  label_rate: ${model.label_rate}
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: true  # must be consistent with extractor

dataset:
  num_workers: 6
  max_tokens: 900000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  validate_interval_updates: 10000

criterion:
  _name: hubert
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]

optimization:
  max_update: 400000
  lr: [0.0015]
  clip_norm: 1.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000

model:
  _name: hubert
  label_rate: ???
  encoder_layers: 24
  encoder_embed_dim: 1024
  encoder_ffn_embed_dim: 4096
  encoder_attention_heads: 16
  final_dim: 768
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: layer_norm
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  encoder_layerdrop: 0.0
  dropout_input: 0.0
  dropout_features: 0.0
  dropout: 0.0
  attention_dropout: 0.0
  layer_norm_first: true
  feature_grad_mult: 1.0
  untie_final_proj: true
  activation_dropout: 0.0

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        # Path-valued overrides are excluded so they do not end up in the
        # generated directory name.
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  # Output locations are mandatory rather than pointing at one user's
  # private checkpoint directory.
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _group_

# Pretraining configuration for the 48-layer / 1280-dim encoder. Values written
# as `???` are left unset here and have to be provided when the job is launched.

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
  save_interval_updates: 25000
  keep_interval_updates: 1
  no_epoch_checkpoints: true

distributed_training:
  ddp_backend: no_c10d
  distributed_backend: 'nccl'
  distributed_world_size: 256
  distributed_port: 29671
  nprocs_per_node: 8
  find_unused_parameters: true

task:
  _name: hubert_pretraining
  data: ???
  label_dir: ???
  labels: ???
  # Kept in sync with the model section through interpolation.
  label_rate: ${model.label_rate}
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: true  # must be consistent with extractor

dataset:
  num_workers: 6
  max_tokens: 360000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: 5
  validate_interval_updates: 10000

criterion:
  _name: hubert
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]

optimization:
  max_update: 400000
  lr: [0.003]
  clip_norm: 1.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000

model:
  _name: hubert
  label_rate: ???
  encoder_layers: 48
  encoder_embed_dim: 1280
  encoder_ffn_embed_dim: 5120
  encoder_attention_heads: 16
  final_dim: 1024
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: layer_norm
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  encoder_layerdrop: 0.0
  dropout_input: 0.0
  dropout_features: 0.0
  dropout: 0.0
  attention_dropout: 0.0
  layer_norm_first: true
  feature_grad_mult: 1.0
  untie_final_proj: true
  activation_dropout: 0.0

hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        # Path-valued overrides are excluded so they do not end up in the
        # generated directory name.
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  # Output locations are mandatory rather than pointing at one user's
  # private checkpoint directory.
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
# @package _global_

# Launcher settings for a 4-node job with 8 GPUs per node; the world size
# below (32) equals nodes * gpus_per_node.
hydra:
  launcher:
    cpus_per_task: 8
    gpus_per_node: 8
    # One task per GPU.
    tasks_per_node: ${hydra.launcher.gpus_per_node}
    nodes: 4
    comment: null
    mem_gb: 384
    timeout_min: 4320
    max_num_timeout: 100
    constraint: volta32gb
    name: ${hydra.job.config_name}/${hydra.job.override_dirname}
    submitit_folder: ${hydra.sweep.dir}/submitit/%j

distributed_training:
  distributed_world_size: 32
  distributed_port: 29671
  nprocs_per_node: 8
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import os.path as op
import re
from tabulate import tabulate
from collections import Counter
def comp_purity(p_xy, axis):
    """Compute purity of a joint distribution along one axis.

    Args:
        p_xy: 2-D numpy array holding the joint probabilities.
        axis: axis that is reduced when taking the max and the marginal.

    Returns:
        A pair ``(indv_pur, aggr_pur)``: the per-entry ratio of the largest
        joint probability to the marginal, and the sum of those largest
        joint probabilities.
    """
    peak = p_xy.max(axis=axis)
    marginal = p_xy.sum(axis=axis)
    return peak / marginal, peak.sum()
def comp_entropy(p):
    """Return the entropy (in nats) of the probability array ``p``.

    A small constant (1e-8) is added inside the log so zero entries do not
    produce ``-inf``.
    """
    log_p = np.log(p + 1e-8)
    weighted = -p * log_p
    return weighted.sum()
def comp_norm_mutual_info(p_xy):
    """Compute mutual information and its entropy-normalized variants.

    Args:
        p_xy: 2-D numpy array of joint probabilities (rows = x, columns = y).

    Returns:
        Tuple ``(mi, mi / h_x, mi / h_y, h_x, h_y)`` where ``h_x`` and ``h_y``
        are the entropies of the row and column marginals.
    """
    row_marg = p_xy.sum(axis=1, keepdims=True)
    col_marg = p_xy.sum(axis=0, keepdims=True)
    # Outer product of the marginals = joint under independence.
    indep = np.matmul(row_marg, col_marg)
    pmi = np.log(p_xy / indep + 1e-8)
    mi = (p_xy * pmi).sum()
    h_x = comp_entropy(row_marg)
    h_y = comp_entropy(col_marg)
    return mi, mi / h_x, mi / h_y, h_x, h_y
def pad(labs, n):
    """Pad a label sequence by repeating its first and last labels ``n`` times.

    Returns a numpy array; with ``n == 0`` the labels are only converted.
    """
    if n == 0:
        return np.array(labs)
    head = [labs[0]] * n
    tail = [labs[-1]] * n
    return np.concatenate([head, labs, tail])
def comp_avg_seg_dur(labs_list):
    """Return the average segment length, in frames, over all sequences.

    A segment is a maximal run of identical consecutive labels, so each
    non-empty sequence contributes one segment plus one per label change.

    Args:
        labs_list: iterable of label sequences (lists or arrays).

    Returns:
        Total number of frames divided by total number of segments, or
        ``nan`` when there are no frames at all.
    """
    n_frms = 0
    n_segs = 0
    for labs in labs_list:
        labs = np.asarray(labs)
        if labs.size == 0:
            # An empty utterance has neither frames nor segments.
            continue
        n_frms += len(labs)
        # The first frame always opens a segment; every change opens another.
        n_segs += 1 + int(np.count_nonzero(labs[1:] != labs[:-1]))
    if n_segs == 0:
        return float("nan")
    return n_frms / n_segs
def comp_joint_prob(uid2refs, uid2hyps):
    """Estimate the joint distribution of reference and hypothesis labels.

    Utterances are matched by uid; each matched pair is truncated to the
    shorter of the two sequences before frame-wise co-occurrences are counted.

    Args:
        uid2refs: dict mapping uid -> reference label sequence.
        uid2hyps: dict mapping uid -> hypothesis label sequence.

    Returns:
        Tuple ``(p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped)``:
        the normalized count matrix (rows = refs, columns = hyps), the
        label -> row and label -> column index maps, the number of counted
        frames, the summed absolute length difference, and the uids that
        had no hypothesis.
    """
    pair_counts = Counter()
    skipped = []
    abs_frmdiff = 0
    for uid, refs in uid2refs.items():
        if uid not in uid2hyps:
            skipped.append(uid)
            continue
        hyps = uid2hyps[uid]
        abs_frmdiff += abs(len(refs) - len(hyps))
        n_common = min(len(refs), len(hyps))
        pair_counts.update(zip(refs[:n_common], hyps[:n_common]))

    tot = sum(pair_counts.values())

    ref_labels = sorted({pair[0] for pair in pair_counts})
    hyp_labels = sorted({pair[1] for pair in pair_counts})
    ref2pid = {lab: idx for idx, lab in enumerate(ref_labels)}
    hyp2lid = {lab: idx for idx, lab in enumerate(hyp_labels)}

    p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float)
    for (ref, hyp), cnt in pair_counts.items():
        p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt
    p_xy /= p_xy.sum()
    return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped
def read_phn(tsv_path, rm_stress=True):
    """Read phone sequences from a two-column, tab-separated file.

    Each line is ``<uid>\\t<phn_1>,<phn_2>,...``. When ``rm_stress`` is true,
    every digit is stripped from each phone.

    Returns:
        dict mapping uid -> list of phone strings.
    """
    uid2phns = {}
    with open(tsv_path) as fin:
        for row in fin:
            uid, joined = row.rstrip().split("\t")
            phones = joined.split(",")
            if rm_stress:
                phones = [re.sub("[0-9]", "", p) for p in phones]
            uid2phns[uid] = phones
    return uid2phns
def read_lab(tsv_path, lab_path, pad_len=0, upsample=1):
    """Read a label file and key its lines by the uids found in a tsv manifest.

    The first tsv line is skipped; for each remaining line, the uid is the
    basename (without extension) of the first whitespace-separated field.
    Each label line is split on whitespace, padded with ``pad`` and then
    repeated element-wise ``upsample`` times.

    Returns:
        dict mapping uid -> numpy array of labels.
    """
    uids = []
    with open(tsv_path) as tsv_f:
        tsv_f.readline()  # header line is not an utterance
        for row in tsv_f:
            audio_path = row.rstrip().split()[0]
            uids.append(op.splitext(op.basename(audio_path))[0])

    labs_list = []
    with open(lab_path) as lab_f:
        for row in lab_f:
            padded = pad(row.rstrip().split(), pad_len)
            labs_list.append(padded.repeat(upsample))

    assert len(uids) == len(labs_list)
    return dict(zip(uids, labs_list))
def main_lab_lab(
    tsv_dir,
    lab_dir,
    lab_name,
    lab_sets,
    ref_dir,
    ref_name,
    pad_len=0,
    upsample=1,
    verbose=False,
):
    """Measure hypothesis labels against another set of labels.

    Reference labels are read from ``{ref_dir}/{set}.{ref_name}`` and
    hypothesis labels from ``{lab_dir}/{set}.{lab_name}``. Both use the same
    manifest ``{tsv_dir}/{set}.tsv``; ``tsv_dir`` falls back to ``lab_dir``
    when it is None. Padding and upsampling apply to the hypotheses only.
    """
    if tsv_dir is None:
        tsv_dir = lab_dir

    uid2refs = {}
    for subset in lab_sets:
        manifest = f"{tsv_dir}/{subset}.tsv"
        uid2refs.update(read_lab(manifest, f"{ref_dir}/{subset}.{ref_name}"))

    uid2hyps = {}
    for subset in lab_sets:
        manifest = f"{tsv_dir}/{subset}.tsv"
        hyp_path = f"{lab_dir}/{subset}.{lab_name}"
        uid2hyps.update(read_lab(manifest, hyp_path, pad_len, upsample))

    _main(uid2refs, uid2hyps, verbose)
def main_phn_lab(
    tsv_dir,
    lab_dir,
    lab_name,
    lab_sets,
    phn_dir,
    phn_sets,
    pad_len=0,
    upsample=1,
    verbose=False,
):
    """Measure hypothesis labels against frame-level phone references.

    Phone references come from ``{phn_dir}/{set}.tsv`` for each entry of
    ``phn_sets``; hypotheses come from ``{lab_dir}/{set}.{lab_name}`` using
    the manifest ``{tsv_dir}/{set}.tsv`` (``tsv_dir`` falls back to
    ``lab_dir`` when it is None).
    """
    uid2refs = {}
    for subset in phn_sets:
        uid2refs.update(read_phn(f"{phn_dir}/{subset}.tsv"))

    if tsv_dir is None:
        tsv_dir = lab_dir

    uid2hyps = {}
    for subset in lab_sets:
        manifest = f"{tsv_dir}/{subset}.tsv"
        hyp_path = f"{lab_dir}/{subset}.{lab_name}"
        uid2hyps.update(read_lab(manifest, hyp_path, pad_len, upsample))

    _main(uid2refs, uid2hyps, verbose)
def _main(uid2refs, uid2hyps, verbose):
    """Compute the label-quality statistics and print them as one table row.

    ``verbose`` is accepted for the callers' sake but is not used here.
    """
    joint = comp_joint_prob(uid2refs, uid2hyps)
    p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped = joint

    _, ref_pur = comp_purity(p_xy, axis=0)
    _, hyp_pur = comp_purity(p_xy, axis=1)
    mi, mi_norm_by_ref, _, h_ref, h_hyp = comp_norm_mutual_info(p_xy)

    # Insertion order defines the column order of the printed table.
    outputs = {}
    outputs["ref pur"] = ref_pur
    outputs["hyp pur"] = hyp_pur
    outputs["H(ref)"] = h_ref
    outputs["H(hyp)"] = h_hyp
    outputs["MI"] = mi
    outputs["MI/H(ref)"] = mi_norm_by_ref
    outputs["ref segL"] = comp_avg_seg_dur(uid2refs.values())
    outputs["hyp segL"] = comp_avg_seg_dur(uid2hyps.values())
    outputs["p_xy shape"] = p_xy.shape
    outputs["frm tot"] = tot
    outputs["frm diff"] = frmdiff
    outputs["utt tot"] = len(uid2refs)
    outputs["utt miss"] = len(skipped)

    print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f"))
if __name__ == "__main__":
    # Compute the quality of labels with respect to phones, or with respect
    # to another set of labels when both --ref_lab_dir and --ref_lab_name
    # are given.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("tsv_dir")
    parser.add_argument("lab_dir")
    parser.add_argument("lab_name")
    parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+")
    parser.add_argument(
        "--phn_dir",
        default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1",
    )
    parser.add_argument(
        "--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+"
    )
    parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses")
    parser.add_argument(
        "--upsample", default=1, type=int, help="upsample factor for hypotheses"
    )
    parser.add_argument("--ref_lab_dir", default="")
    parser.add_argument("--ref_lab_name", default="")
    parser.add_argument("--verbose", action="store_true")
    args = parser.parse_args()

    # Arguments shared by both entry points, in positional order.
    hyp_args = (args.tsv_dir, args.lab_dir, args.lab_name, args.lab_sets)
    tail_args = (args.pad_len, args.upsample, args.verbose)

    if args.ref_lab_dir and args.ref_lab_name:
        main_lab_lab(*hyp_args, args.ref_lab_dir, args.ref_lab_name, *tail_args)
    else:
        main_phn_lab(*hyp_args, args.phn_dir, args.phn_sets, *tail_args)
# Sharded Feature Extraction and K-means Application
This folder contains scripts for preparing HUBERT labels from tsv files, the
steps are:
1. feature extraction
2. k-means clustering
3. k-means application
## Data preparation
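`*.tsv` files contain a list of audio files: the first line is the root
directory, and each following line gives the subpath of one audio file. The
dump scripts in this folder split each of those lines on a tab into the
subpath and its number of samples: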
```
<root-dir>
<audio-path-1>
<audio-path-2>
...
```
## Feature extraction
### MFCC feature
Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D
mfcc+delta+ddelta features for the 1st iteration HUBERT training, run:
```sh
python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir}
```
This would shard the tsv file into `${nshard}` and extract features for the
`${rank}`-th shard, where rank is an integer in `[0, nshard-1]`. Features would
be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
### HUBERT feature
To extract features from the `${layer}`-th transformer layer of a trained
HUBERT model saved at `${ckpt_path}`, run:
```sh
python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir}
```
Features would also be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`.
- if out-of-memory, decrease the chunk size with `--max_chunk`
## K-means clustering
To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run
```sh
python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1
```
This saves the k-means model to `${km_path}`.
- set `--percent -1` to use all data
- more kmeans options can be found with `-h` flag
## K-means application
To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run
```sh
python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir}
```
This would extract labels for the `${rank}`-th shard out of `${nshard}` shards
and dump them to `${lab_dir}/${split}_${rank}_${nshard}.km`.
Finally, merge shards for `${split}` by running
```sh
for rank in $(seq 0 $((nshard - 1))); do
cat $lab_dir/${split}_${rank}_${nshard}.km
done > $lab_dir/${split}.km
```
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import math
import os
import sys
import fairseq
import soundfile as sf
import torch
import torch.nn.functional as F
import tqdm
from npy_append_array import NpyAppendArray
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_hubert_feature")
class HubertFeatureReader(object):
    """Extract features from one layer of a model loaded from a checkpoint.

    The checkpoint is loaded through fairseq, put in eval mode and moved to
    the GPU. Audio is processed in chunks of at most ``max_chunk`` samples.
    """

    def __init__(self, ckpt_path, layer, max_chunk=1600000):
        (
            model,
            cfg,
            task,
        ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
        self.model = model[0].eval().cuda()
        self.task = task
        self.layer = layer
        self.max_chunk = max_chunk
        logger.info(f"TASK CONFIG:\n{self.task.cfg}")
        logger.info(f" max_chunk = {self.max_chunk}")

    def read_audio(self, path, ref_len=None):
        """Read a waveform from ``path`` and return it as a 1-D array.

        The file's sample rate must equal the task's configured rate.
        Two-dimensional input is averaged over its last axis. If ``ref_len``
        is given and differs from the read length by more than 160 samples,
        a warning is logged (the audio is still returned).
        """
        wav, sr = sf.read(path)
        assert sr == self.task.cfg.sample_rate, sr
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            # Use the module logger so the record carries this script's name.
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        """Return the features of layer ``self.layer`` for the audio at ``path``.

        The waveform is layer-normalized first when the task config asks for
        normalization, then fed to the model chunk by chunk; per-chunk
        outputs are concatenated along dim 1 and the leading dim is squeezed.
        """
        x = self.read_audio(path, ref_len)
        with torch.no_grad():
            x = torch.from_numpy(x).float().cuda()
            if self.task.cfg.normalize:
                x = F.layer_norm(x, x.shape)
            x = x.view(1, -1)

            feat = []
            for start in range(0, x.size(1), self.max_chunk):
                x_chunk = x[:, start: start + self.max_chunk]
                feat_chunk, _ = self.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    output_layer=self.layer,
                )
                feat.append(feat_chunk)
        return torch.cat(feat, 1).squeeze(0)
def get_path_iterator(tsv, nshard, rank):
    """Build an iterator over the audio files of one shard of a tsv manifest.

    The first line of ``tsv`` is the root directory; every other line is
    ``<subpath>\\t<nsample>``. The entries are cut into ``nshard`` contiguous
    shards of ``ceil(total / nshard)`` lines and shard ``rank`` is selected.

    Returns:
        ``(iterate, n)`` where ``iterate()`` yields ``(path, nsample)`` pairs
        and ``n`` is the number of entries in the shard.

    Raises:
        AssertionError: if the selected shard is empty.
    """
    with open(tsv, "r") as f:
        root = f.readline().rstrip()
        lines = [line.rstrip() for line in f]
        tot = len(lines)
        shard_size = math.ceil(tot / nshard)
        start, end = rank * shard_size, min((rank + 1) * shard_size, tot)
        assert start < end, f"start={start}, end={end}"
        logger.info(
            f"rank {rank} of {nshard}, process {end-start} "
            f"({start}-{end}) out of {tot}"
        )

        lines = lines[start:end]

        def iterate():
            for line in lines:
                subpath, nsample = line.split("\t")
                yield f"{root}/{subpath}", int(nsample)

    return iterate, len(lines)
def dump_feature(
    tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk
):
    """Extract features for one shard and write them to ``feat_dir``.

    Features of all utterances are appended to
    ``{feat_dir}/{split}_{rank}_{nshard}.npy`` (an existing file is removed
    first) and the per-utterance frame counts are written, one per line, to
    the matching ``.len`` file.
    """
    reader = HubertFeatureReader(ckpt_path, layer, max_chunk)
    make_iter, n_utts = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
    utterances = make_iter()

    feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"

    os.makedirs(feat_dir, exist_ok=True)
    # Start from a clean file: the writer below appends.
    if os.path.exists(feat_path):
        os.remove(feat_path)

    # NOTE(review): the append-array is never explicitly closed here;
    # confirm the installed npy_append_array version does not require it.
    feat_f = NpyAppendArray(feat_path)
    with open(leng_path, "w") as leng_f:
        for wav_path, n_samples in tqdm.tqdm(utterances, total=n_utts):
            utt_feat = reader.get_feats(wav_path, n_samples)
            feat_f.append(utt_feat.cpu().numpy())
            leng_f.write(f"{len(utt_feat)}\n")
    logger.info("finished successfully")
if __name__ == "__main__":
    import argparse

    # Positional arguments mirror the parameters of dump_feature.
    cli = argparse.ArgumentParser()
    for positional in ("tsv_dir", "split", "ckpt_path"):
        cli.add_argument(positional)
    for positional in ("layer", "nshard", "rank"):
        cli.add_argument(positional, type=int)
    cli.add_argument("feat_dir")
    cli.add_argument("--max_chunk", type=int, default=1600000)
    args = cli.parse_args()
    logger.info(args)

    dump_feature(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import csv
import io
import logging
import math
import os
import os.path as op
import sys
import tqdm
from dump_hubert_feature import HubertFeatureReader
from fairseq.data.audio.audio_utils import get_waveform
from fairseq.data.audio.speech_to_text_dataset import (
read_from_uncompressed_zip,
)
from npy_append_array import NpyAppendArray
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_hubert_feature_s2t")
class HubertFeatureReaderS2T(HubertFeatureReader):
    """Feature reader whose audio lives inside an uncompressed zip archive."""

    def read_audio(self, path, ref_len=None):
        """Read a waveform addressed as ``<archive>.zip:<int>:<int>``.

        The two integers are passed to ``read_from_uncompressed_zip`` together
        with the archive path; the returned bytes are decoded with
        ``get_waveform``. The sample rate must equal the task's configured
        rate, 2-D audio is averaged over its last axis, and a warning is
        logged if ``ref_len`` differs from the read length by more than 160.
        """
        path, *extra = path.split(":")
        assert len(extra) == 2
        assert path.endswith(".zip")

        data = read_from_uncompressed_zip(path, int(extra[0]), int(extra[1]))
        f = io.BytesIO(data)
        wav, sr = get_waveform(f)
        assert sr == self.task.cfg.sample_rate, sr
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            # Use the module logger so the record carries this script's name.
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav
def get_path_iterator(root, tsv, nshard, rank):
    """Build an iterator over the audio entries of one shard of a manifest.

    ``tsv`` is a tab-separated file with a header row; the ``audio`` column of
    every row is joined onto ``root`` exactly once. The entries are cut into
    ``nshard`` contiguous shards of ``ceil(total / nshard)`` rows and shard
    ``rank`` is selected.

    Returns:
        ``(iterate, n)`` where ``iterate()`` yields the joined paths and ``n``
        is the number of entries in the shard.

    Raises:
        AssertionError: if the selected shard is empty.
    """
    with open(tsv) as f:
        reader = csv.DictReader(
            f,
            delimiter="\t",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        subpaths = [op.join(root, e["audio"]) for e in reader]
        tot = len(subpaths)
        shard_size = math.ceil(tot / nshard)
        start, end = rank * shard_size, min((rank + 1) * shard_size, tot)
        assert start < end, f"start={start}, end={end}"
        logger.info(
            f"rank {rank} of {nshard}, process {end-start} "
            f"({start}-{end}) out of {tot}"
        )

        subpaths = subpaths[start:end]

        def iterate():
            # Entries already include `root`; joining it again would double
            # a relative root.
            for subpath in subpaths:
                yield subpath

    return iterate, len(subpaths)
def dump_feature(
    root,
    tsv_path,
    ckpt_path,
    layer,
    nshard,
    rank,
    feat_dir,
    feat_name,
    max_chunk,
):
    """Extract features for one shard of a manifest and write them out.

    Features of all utterances are appended to
    ``{feat_dir}/{feat_name}_{rank}_{nshard}.npy`` (an existing file is
    removed first) and the per-utterance frame counts are written, one per
    line, to the matching ``.len`` file.
    """
    reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk)
    make_iter, n_utts = get_path_iterator(root, tsv_path, nshard, rank)
    utterances = make_iter()

    feat_path = f"{feat_dir}/{feat_name}_{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{feat_name}_{rank}_{nshard}.len"

    os.makedirs(feat_dir, exist_ok=True)
    # Start from a clean file: the writer below appends.
    if op.exists(feat_path):
        os.remove(feat_path)

    feat_f = NpyAppendArray(feat_path)
    with open(leng_path, "w") as leng_f:
        for audio_ref in tqdm.tqdm(utterances, total=n_utts):
            utt_feat = reader.get_feats(audio_ref)
            feat_f.append(utt_feat.cpu().numpy())
            leng_f.write(f"{len(utt_feat)}\n")
    logger.info("finished successfully")
if __name__ == "__main__":
    import argparse

    # Positional arguments mirror the parameters of dump_feature.
    cli = argparse.ArgumentParser()
    for positional in ("root", "tsv_path", "ckpt_path"):
        cli.add_argument(positional)
    for positional in ("layer", "nshard", "rank"):
        cli.add_argument(positional, type=int)
    for positional in ("feat_dir", "feat_name"):
        cli.add_argument(positional)
    cli.add_argument("--max_chunk", type=int, default=1600000)
    args = cli.parse_args()
    logger.info(args)

    dump_feature(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import numpy as np
import joblib
import torch
import tqdm
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_km_label")
class ApplyKmeans(object):
    """Assign feature frames to the nearest centroid of a saved k-means model.

    The centroids are stored transposed (one centroid per column) together
    with their squared norms, as numpy arrays and as torch tensors; the
    tensors are moved to the GPU when CUDA is available.
    """

    def __init__(self, km_path):
        self.km_model = joblib.load(km_path)
        self.C_np = self.km_model.cluster_centers_.transpose()
        self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)

        self.C = torch.from_numpy(self.C_np)
        self.Cnorm = torch.from_numpy(self.Cnorm_np)
        if torch.cuda.is_available():
            self.C = self.C.cuda()
            self.Cnorm = self.Cnorm.cuda()

    def __call__(self, x):
        """Return, as a numpy array, the nearest-centroid index of each row of ``x``.

        ``x`` may be a torch tensor or a numpy array; squared distances are
        expanded as ``|x|^2 - 2 x.C + |C|^2``.
        """
        if not isinstance(x, torch.Tensor):
            x_sq = (x ** 2).sum(1, keepdims=True)
            cross = np.matmul(x, self.C_np)
            dist = x_sq - 2 * cross + self.Cnorm_np
            return np.argmin(dist, axis=1)

        x_sq = x.pow(2).sum(1, keepdim=True)
        cross = torch.matmul(x, self.C)
        dist = x_sq - 2 * cross + self.Cnorm
        return dist.argmin(dim=1).cpu().numpy()
def get_feat_iterator(feat_dir, split, nshard, rank):
    """Build an iterator over the per-utterance feature slices of one shard.

    Reads the frame counts from ``{feat_dir}/{split}_{rank}_{nshard}.len`` and
    slices the memory-mapped ``.npy`` file of the same stem accordingly.

    Returns:
        ``(iterate, n)`` where ``iterate()`` yields one slice per utterance
        and ``n`` is the number of utterances.
    """
    shard_stem = f"{feat_dir}/{split}_{rank}_{nshard}"
    feat_path = f"{shard_stem}.npy"
    leng_path = f"{shard_stem}.len"

    with open(leng_path, "r") as f:
        lengs = [int(row.rstrip()) for row in f]
        # Start offset of each utterance = total length of the ones before it.
        offsets = [0] + np.cumsum(lengs[:-1]).tolist()

    def iterate():
        feat = np.load(feat_path, mmap_mode="r")
        expected_frames = offsets[-1] + lengs[-1]
        assert feat.shape[0] == expected_frames
        for begin, n_frames in zip(offsets, lengs):
            yield feat[begin: begin + n_frames]

    return iterate, len(lengs)
def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir):
    """Apply a saved k-means model to one feature shard and write the labels.

    One line of space-separated cluster indices per utterance is written to
    ``{lab_dir}/{split}_{rank}_{nshard}.km``.
    """
    apply_kmeans = ApplyKmeans(km_path)
    make_iter, n_utts = get_feat_iterator(feat_dir, split, nshard, rank)
    utterances = make_iter()

    lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km"
    os.makedirs(lab_dir, exist_ok=True)
    with open(lab_path, "w") as out_f:
        for utt_feat in tqdm.tqdm(utterances, total=n_utts):
            # Features are passed as numpy slices, not torch tensors.
            cluster_ids = apply_kmeans(utt_feat).tolist()
            out_f.write(" ".join(str(c) for c in cluster_ids) + "\n")
    logger.info("finished successfully")
if __name__ == "__main__":
    import argparse

    # Positional arguments mirror the parameters of dump_label.
    parser = argparse.ArgumentParser()
    parser.add_argument("feat_dir")
    parser.add_argument("split")
    parser.add_argument("km_path")
    parser.add_argument("nshard", type=int)
    parser.add_argument("rank", type=int)
    parser.add_argument("lab_dir")
    args = parser.parse_args()
    # Log through the module logger, like the rest of this script.
    logger.info(str(args))

    dump_label(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import math
import os
import sys
import soundfile as sf
import torch
import torchaudio
import tqdm
from npy_append_array import NpyAppendArray
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("dump_mfcc_feature")
class MfccFeatureReader(object):
    """Compute MFCC features with first- and second-order deltas."""

    def __init__(self, sample_rate):
        self.sample_rate = sample_rate

    def read_audio(self, path, ref_len=None):
        """Read a waveform from ``path`` and return it as a 1-D array.

        The file's sample rate must equal ``self.sample_rate``. 2-D audio is
        averaged over its last axis. If ``ref_len`` is given and differs from
        the read length by more than 160 samples, a warning is logged (the
        audio is still returned).
        """
        wav, sr = sf.read(path)
        assert sr == self.sample_rate, sr
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            # Use the module logger so the record carries this script's name.
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        """Return MFCCs, deltas and delta-deltas concatenated per frame.

        The result is laid out as (time, freq), with the three feature groups
        stacked along the freq axis.
        """
        x = self.read_audio(path, ref_len)
        with torch.no_grad():
            x = torch.from_numpy(x).float()
            x = x.view(1, -1)

            mfccs = torchaudio.compliance.kaldi.mfcc(
                waveform=x,
                sample_frequency=self.sample_rate,
                use_energy=False,
            )  # (time, freq)
            # Deltas are computed on the (freq, time) layout.
            mfccs = mfccs.transpose(0, 1)  # (freq, time)
            deltas = torchaudio.functional.compute_deltas(mfccs)
            ddeltas = torchaudio.functional.compute_deltas(deltas)
            concat = torch.cat([mfccs, deltas, ddeltas], dim=0)
            concat = concat.transpose(0, 1).contiguous()  # (time, freq)
            return concat
def get_path_iterator(tsv, nshard, rank):
    """Build an iterator over the audio files of one shard of a tsv manifest.

    The first line of ``tsv`` is the root directory; every other line is
    ``<subpath>\\t<nsample>``. The entries are cut into ``nshard`` contiguous
    shards of ``ceil(total / nshard)`` lines and shard ``rank`` is selected.

    Returns:
        ``(iterate, n)`` where ``iterate()`` yields ``(path, nsample)`` pairs
        and ``n`` is the number of entries in the shard.

    Raises:
        AssertionError: if the selected shard is empty.
    """
    with open(tsv, "r") as f:
        root = f.readline().rstrip()
        lines = [line.rstrip() for line in f]
        tot = len(lines)
        shard_size = math.ceil(tot / nshard)
        start, end = rank * shard_size, min((rank + 1) * shard_size, tot)
        assert start < end, f"start={start}, end={end}"
        logger.info(
            f"rank {rank} of {nshard}, process {end-start} "
            f"({start}-{end}) out of {tot}"
        )

        lines = lines[start:end]

        def iterate():
            for line in lines:
                subpath, nsample = line.split("\t")
                yield f"{root}/{subpath}", int(nsample)

    return iterate, len(lines)
def dump_feature(tsv_dir, split, sample_rate, nshard, rank, feat_dir):
    """Extract MFCC features for one shard and write them to ``feat_dir``.

    Features of all utterances are appended to
    ``{feat_dir}/{split}_{rank}_{nshard}.npy`` (an existing file is removed
    first) and the per-utterance frame counts are written, one per line, to
    the matching ``.len`` file.
    """
    reader = MfccFeatureReader(sample_rate)
    make_iter, n_utts = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
    utterances = make_iter()

    feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"

    os.makedirs(feat_dir, exist_ok=True)
    # Start from a clean file: the writer below appends.
    if os.path.exists(feat_path):
        os.remove(feat_path)

    feat_f = NpyAppendArray(feat_path)
    with open(leng_path, "w") as leng_f:
        for wav_path, n_samples in tqdm.tqdm(utterances, total=n_utts):
            utt_feat = reader.get_feats(wav_path, n_samples)
            feat_f.append(utt_feat.cpu().numpy())
            leng_f.write(f"{len(utt_feat)}\n")
    logger.info("finished successfully")
if __name__ == "__main__":
    import argparse

    # Argument names mirror the parameters of dump_feature.
    cli = argparse.ArgumentParser()
    for positional in ("tsv_dir", "split"):
        cli.add_argument(positional)
    for positional in ("nshard", "rank"):
        cli.add_argument(positional, type=int)
    cli.add_argument("feat_dir")
    cli.add_argument("--sample_rate", type=int, default=16000)
    args = cli.parse_args()
    logger.info(args)

    dump_feature(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import sys
import numpy as np
from sklearn.cluster import MiniBatchKMeans
import joblib
logging.basicConfig(
format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
level=os.environ.get("LOGLEVEL", "INFO").upper(),
stream=sys.stdout,
)
logger = logging.getLogger("learn_kmeans")
def get_km_model(
    n_clusters,
    init,
    max_iter,
    batch_size,
    tol,
    max_no_improvement,
    n_init,
    reassignment_ratio,
):
    """Create an unfitted ``MiniBatchKMeans`` with the given hyper-parameters.

    Verbose output is always on, label computation is off, and ``init_size``
    is left at ``None``.
    """
    fixed = dict(verbose=1, compute_labels=False, init_size=None)
    tunable = dict(
        n_clusters=n_clusters,
        init=init,
        max_iter=max_iter,
        batch_size=batch_size,
        tol=tol,
        max_no_improvement=max_no_improvement,
        n_init=n_init,
        reassignment_ratio=reassignment_ratio,
    )
    return MiniBatchKMeans(**tunable, **fixed)
def load_feature_shard(feat_dir, split, nshard, rank, percent):
    """Load the features of one shard, optionally sub-sampled by utterance.

    With ``percent < 0`` the whole memory-mapped feature file is returned.
    Otherwise ``ceil(num_utterances * percent)`` utterances are drawn without
    replacement via ``np.random.choice`` and their frames are concatenated.
    """
    shard_stem = f"{feat_dir}/{split}_{rank}_{nshard}"
    feat_path = f"{shard_stem}.npy"
    leng_path = f"{shard_stem}.len"

    with open(leng_path, "r") as f:
        lengs = [int(row.rstrip()) for row in f]

    if percent < 0:
        return np.load(feat_path, mmap_mode="r")

    # Start offset of each utterance = total length of the ones before it.
    offsets = [0] + np.cumsum(lengs[:-1]).tolist()

    nsample = int(np.ceil(len(lengs) * percent))
    indices = np.random.choice(len(lengs), nsample, replace=False)
    feat = np.load(feat_path, mmap_mode="r")
    picked = []
    for i in indices:
        picked.append(feat[offsets[i]: offsets[i] + lengs[i]])
    sampled_feat = np.concatenate(picked, axis=0)
    logger.info(
        (
            f"sampled {nsample} utterances, {len(sampled_feat)} frames "
            f"from shard {rank}/{nshard}"
        )
    )
    return sampled_feat
def load_feature(feat_dir, split, nshard, seed, percent):
    """Load and concatenate the (optionally sub-sampled) features of all shards.

    ``percent`` must not exceed 1.0; it is forwarded to ``load_feature_shard``
    for every rank in ``range(nshard)``. ``seed`` is accepted but not used
    here.
    """
    assert percent <= 1.0
    feat = np.concatenate(
        [
            load_feature_shard(feat_dir, split, nshard, r, percent)
            for r in range(nshard)
        ],
        axis=0,
    )
    # Log through the module logger, like the rest of this script.
    logger.info(f"loaded feature with dimension {feat.shape}")
    return feat
def learn_kmeans(
    feat_dir,
    split,
    nshard,
    km_path,
    n_clusters,
    seed,
    percent,
    init,
    max_iter,
    batch_size,
    tol,
    n_init,
    reassignment_ratio,
    max_no_improvement,
):
    """Fit a mini-batch k-means model on dumped features and save it.

    Numpy's global RNG is seeded with ``seed`` before the features are
    loaded. The fitted model is written to ``km_path`` with joblib, and the
    negated model score divided by the number of frames is logged.
    """
    np.random.seed(seed)
    feat = load_feature(feat_dir, split, nshard, seed, percent)
    km_model = get_km_model(
        n_clusters,
        init,
        max_iter,
        batch_size,
        tol,
        max_no_improvement,
        n_init,
        reassignment_ratio,
    )
    km_model.fit(feat)
    joblib.dump(km_model, km_path)

    inertia = -km_model.score(feat) / len(feat)
    logger.info("total inertia: %.5f", inertia)
    logger.info("finished successfully")
if __name__ == "__main__":
    import argparse

    # Argument names mirror the parameters of learn_kmeans.
    parser = argparse.ArgumentParser()
    parser.add_argument("feat_dir", type=str)
    parser.add_argument("split", type=str)
    parser.add_argument("nshard", type=int)
    parser.add_argument("km_path", type=str)
    parser.add_argument("n_clusters", type=int)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument(
        "--percent", default=-1, type=float, help="sample a subset; -1 for all"
    )
    parser.add_argument("--init", default="k-means++")
    parser.add_argument("--max_iter", default=100, type=int)
    parser.add_argument("--batch_size", default=10000, type=int)
    parser.add_argument("--tol", default=0.0, type=float)
    parser.add_argument("--max_no_improvement", default=100, type=int)
    parser.add_argument("--n_init", default=20, type=int)
    parser.add_argument("--reassignment_ratio", default=0.0, type=float)
    args = parser.parse_args()
    # Log through the module logger, like the rest of this script.
    logger.info(str(args))

    learn_kmeans(**vars(args))
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
# Hard-coded checkpoint locations: `src_ckpt` is the checkpoint that gets
# converted and `new_ckpt` is where the converted copy is written.
# NOTE(review): `ref_ckpt` is not referenced anywhere in the visible script.
src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"
def update_state(state):
    """Rewrite a checkpoint state dict in place and return it.

    The model entry ``label_embs`` is renamed to ``label_embs_concat``, the
    task name is set to ``hubert_pretraining``, and ``args.labels`` is wrapped
    into the string form of a one-element list.
    """
    weights = state["model"]
    weights["label_embs_concat"] = weights.pop("label_embs")

    ckpt_args = state["args"]
    ckpt_args.task = "hubert_pretraining"
    ckpt_args.labels = f"['{ckpt_args.labels}']"
    return state
# Load the source checkpoint, convert it, and save the result under the new
# path; the source file itself is left untouched.
src_state = torch.load(src_ckpt)
src_state = update_state(src_state)
torch.save(src_state, new_ckpt)
# Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)
This page includes instructions for training models described in [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](https://arxiv.org/abs/1909.02074).
## Training a joint alignment-translation model on WMT'18 En-De
##### 1. Extract and preprocess the WMT'18 En-De data
```bash
./prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
```
##### 2. Generate alignments from statistical alignment toolkits e.g. Giza++/FastAlign.
In this example, we use FastAlign.
```bash
git clone git@github.com:clab/fast_align.git
pushd fast_align
mkdir build
cd build
cmake ..
make
popd
ALIGN=fast_align/build/fast_align
paste bpe.32k/train.en bpe.32k/train.de | awk -F '\t' '{print $1 " ||| " $2}' > bpe.32k/train.en-de
$ALIGN -i bpe.32k/train.en-de -d -o -v > bpe.32k/train.align
```
##### 3. Preprocess the dataset with the above generated alignments.
```bash
fairseq-preprocess \
--source-lang en --target-lang de \
--trainpref bpe.32k/train \
--validpref bpe.32k/valid \
--testpref bpe.32k/test \
--align-suffix align \
--destdir binarized/ \
--joined-dictionary \
--workers 32
```
##### 4. Train a model
```bash
fairseq-train \
binarized \
--arch transformer_wmt_en_de_big_align --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu \
--lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
--max-tokens 3500 --label-smoothing 0.1 \
--save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
--keep-interval-updates -1 --save-interval-updates 0 \
--load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
--fp16
```
Note that the `--fp16` flag requires CUDA 9.1 or greater and a Volta GPU or newer.
If you want to train the above model with big batches (assuming your machine has 8 GPUs):
- add `--update-freq 8` to simulate training on 8x8=64 GPUs
- increase the learning rate; 0.0007 works well for big batches
##### 5. Evaluate and generate the alignments (BPE level)
```bash
fairseq-generate \
binarized --gen-subset test --print-alignment \
--source-lang en --target-lang de \
--path checkpoints/checkpoint_best.pt --beam 5 --nbest 1
```
##### 6. Other resources.
The code for:
1. preparing alignment test sets
2. converting BPE level alignments to token level alignments
3. symmetrizing bidirectional alignments
4. evaluating alignments using AER metric
can be found [here](https://github.com/lilt/alignment-scripts)
## Citation
```bibtex
@inproceedings{garg2019jointly,
title = {Jointly Learning to Align and Translate with Transformer Models},
author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias},
booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
address = {Hong Kong},
month = {November},
url = {https://arxiv.org/abs/1909.02074},
year = {2019},
}
```
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Fetch the Moses toolkit; the perl scripts referenced below live inside it.
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git

# Paths to the Moses perl scripts used by the pre-processing steps below.
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl

# Archives that the download loop fetches into $orig.
URLS=(
    "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
    "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
    "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
    "http://data.statmt.org/wmt17/translation-task/dev.tgz"
    "http://statmt.org/wmt14/test-full.tgz"
)

# Corpus file prefixes (relative to $orig, without the language suffix) that
# are concatenated to build the training data.
CORPORA=(
    "training/europarl-v7.de-en"
    "commoncrawl.de-en"
    "training-parallel-nc-v13/news-commentary-v13.de-en"
    "rapid2016.de-en"
)

# Abort when the Moses scripts directory is missing. Exit with a non-zero
# status so callers can detect the failure: a bare `exit` would return the
# status of the preceding `echo`, i.e. success.
if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    exit 1
fi

# Language pair and working locations used by the rest of the script.
src=en
tgt=de
lang=en-de
prep=wmt18_en_de
tmp=$prep/tmp              # intermediate tokenized files
orig=orig                  # downloaded archives and their extracted contents
dev=dev/newstest2012       # validation set prefix, relative to $orig
codes=32000                # passed to `fastBPE/fast learnbpe`
bpe=bpe.32k                # destination of the BPE codes and BPE-encoded splits

mkdir -p $orig $tmp $prep $bpe
# Download every archive listed in URLS into $orig. An archive already on
# disk is skipped entirely; a freshly downloaded one is unpacked right away.
cd $orig
for url in "${URLS[@]}"; do
    file=$(basename $url)
    if [ -f $file ]; then
        echo "$file already exists, skipping download"
        continue
    fi

    wget "$url"
    if [ ! -f $file ]; then
        echo "$url not successfully downloaded."
        exit 1
    fi
    echo "$url successfully downloaded."

    # Pick the tar invocation from the archive's extension.
    case "$file" in
        *.tgz) tar zxvf $file ;;
        *.tar) tar xvf $file ;;
    esac
done
cd ..
# Build one tokenized training file per language by appending every corpus
# in CORPORA after non-printing-character removal and Moses tokenization.
echo "pre-processing train data..."
for l in $src $tgt; do
    train_tok=$tmp/train.tags.$lang.tok.$l
    # Start from a clean file because the loop below appends to it.
    rm -rf $train_tok
    for corpus in "${CORPORA[@]}"; do
        cat $orig/$corpus.$l \
            | perl $REM_NON_PRINT_CHAR \
            | perl $TOKENIZER -threads 8 -l $l -no-escape >> $train_tok
    done
done
# Extract the newstest2014 segments from the SGM files and tokenize them.
echo "pre-processing test data..."
for l in $src $tgt; do
    # The source language reads the "src" SGM file; the other language
    # reads the "ref" one.
    t="ref"
    if [ "$l" == "$src" ]; then
        t="src"
    fi

    # Keep only the <seg> lines, strip the opening and closing tags, apply
    # the apostrophe substitution (appears to map the typographic ’ to an
    # ASCII ' — NOTE(review): confirm against your sed), then tokenize.
    grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
        sed -e 's/<seg id="[0-9]*">\s*//g' \
            -e 's/\s*<\/seg>\s*//g' \
            -e "s/\’/\'/g" | \
        perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l
    echo ""
done
# Length-filter the tokenized training pair before BPE is learned; the
# filtered output is written with the $tmp/train prefix.
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100

# newstest2012 ($dev) serves as the validation set.
echo "pre-processing valid data..."
for l in $src $tgt; do
    valid_tok=$tmp/valid.$l
    # Remove any previous output first, since the pipeline below appends.
    rm -rf $valid_tok
    cat $orig/$dev.$l \
        | perl $REM_NON_PRINT_CHAR \
        | perl $TOKENIZER -threads 8 -l $l -no-escape >> $valid_tok
done
# Gather the tokenized splits in ./output. `-p` keeps a re-run from erroring
# when the directory already exists, matching the `mkdir -p` used earlier.
mkdir -p output
mv $tmp/{train,valid,test}.{$src,$tgt} output

# BPE: build fastBPE, learn joint codes on both training sides, then encode
# every split for both languages into $bpe.
git clone https://github.com/glample/fastBPE.git
pushd fastBPE
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
popd

fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes

# Iterate over $src/$tgt instead of a hard-coded en/de pair, and use a loop
# variable that does not overwrite $lang (the "en-de" pair name).
for split in train valid test; do
    for l in $src $tgt; do
        fastBPE/fast applybpe $bpe/$split.$l output/$split.$l $bpe/codes
    done
done
# Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)
## Pre-trained models
Description | Parameters | Dataset | Model and Test set(s)
---|---:|---|---
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
## Training an LM with adaptive inputs
First, see the general [language modeling README](README.md) for instructions on
preprocessing the WikiText-103 data.
Then use the following training command to train a model with adaptive inputs
using the `transformer_lm_wiki103` model architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm_wiki103 \
--max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
--warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \
--criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
--sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp
```
## Citation
```bibtex
@inproceedings{
baevski2018adaptive,
title={Adaptive Input Representations for Neural Language Modeling},
author={Alexei Baevski and Michael Auli},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=ByxZX20qFQ},
}
```
# Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)
## Example usage
First download and preprocess the data following the main [language modeling README](README.md).
Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103`
architecture:
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/fconv_wikitext-103 \
--arch fconv_lm_dauphin_wikitext103 \
--adaptive-softmax-cutoff 10000,20000,200000 \
--dropout 0.2 \
--criterion adaptive_loss \
--optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--max-tokens 1024 --tokens-per-sample 1024 \
--ddp-backend legacy_ddp \
--max-epoch 35
```
And evaluate with:
```bash
fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wikitext-103/checkpoint_best.pt
```
## Citation
```bibtex
@inproceedings{dauphin2017language,
title={Language Modeling with Gated Convolutional Networks},
author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},
booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
pages={933--941},
year={2017},
organization={JMLR}
}
```
# Neural Language Modeling
## Pre-trained models
Model | Description | Dataset | Download
---|---|---|---
`transformer_lm.gbw.adaptive_huge` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
`transformer_lm.wiki103.adaptive` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 247M params | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
`transformer_lm.wmt19.en` | English LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
`transformer_lm.wmt19.de` | German LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
`transformer_lm.wmt19.ru` | Russian LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
## Example usage
We require a few additional Python dependencies for preprocessing:
```bash
pip install fastBPE sacremoses
```
To sample from a language model using PyTorch Hub:
```python
import torch
# List available models
torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...]
# Load an English LM trained on WMT'19 News Crawl data
en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
en_lm.eval() # disable dropout
# Move model to GPU
en_lm.cuda()
# Sample from the language model
en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
# "Barack Obama is coming to Sydney and New Zealand (...)"
# Compute perplexity for a sequence
en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp()
# tensor(15.1474)
# The same interface can be used with custom models as well
from fairseq.models.transformer_lm import TransformerLanguageModel
custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
custom_lm.sample('Barack Obama', beam=5)
# "Barack Obama (...)"
```
## Training a transformer language model with the CLI tools
### 1) Preprocess the data
First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/):
```bash
cd examples/language_model/
bash prepare-wikitext-103.sh
cd ../..
```
Next preprocess/binarize the data:
```bash
TEXT=examples/language_model/wikitext-103
fairseq-preprocess \
--only-source \
--trainpref $TEXT/wiki.train.tokens \
--validpref $TEXT/wiki.valid.tokens \
--testpref $TEXT/wiki.test.tokens \
--destdir data-bin/wikitext-103 \
--workers 20
```
### 2) Train a language model
Next we'll train a basic transformer language model on wikitext-103. For more
advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md).
To train a basic LM (assumes 2 GPUs):
```bash
fairseq-train --task language_modeling \
data-bin/wikitext-103 \
--save-dir checkpoints/transformer_wikitext-103 \
--arch transformer_lm --share-decoder-input-output-embed \
--dropout 0.1 \
--optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
--lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
--tokens-per-sample 512 --sample-break-mode none \
--max-tokens 2048 --update-freq 16 \
--fp16 \
--max-update 50000
```
If you run out of memory, try reducing `--max-tokens` (max number of tokens per
batch) or `--tokens-per-sample` (max sequence length). You can also adjust
`--update-freq` to accumulate gradients and simulate training on a different
number of GPUs.
### 3) Evaluate
```bash
fairseq-eval-lm data-bin/wikitext-103 \
--path checkpoints/transformer_wikitext-103/checkpoint_best.pt \
--batch-size 2 \
--tokens-per-sample 512 \
--context-window 400
# | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
# | Loss: 3.4164, Perplexity: 30.46
```
*Note:* The `--context-window` option controls how much context is provided to
each token when computing perplexity. When the window size is 0, the dataset is
chunked into segments of length 512 and perplexity is computed over each segment
normally. However, this results in worse (higher) perplexity since tokens that
appear earlier in each segment have less conditioning. When the maximum window
size is used (511 in this case), then we compute perplexity for each token
fully conditioned on 511 tokens of context. This slows down evaluation
significantly, since we must run a separate forward pass for every token in the
dataset, but results in better (lower) perplexity.
## Convolutional language models
Please see the [convolutional LM README](README.conv.md) for instructions on
training convolutional language models.
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
# Download and unpack the WikiText-103 archive into the current directory.
# URLS[i] is the download location of FILES[i]; the two arrays are parallel.
URLS=(
    "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
)
FILES=(
    "wikitext-103-v1.zip"
)

for ((i=0;i<${#URLS[@]};++i)); do
    file=${FILES[i]}
    if [ -f "$file" ]; then
        # An archive already on disk is neither re-downloaded nor re-extracted.
        echo "$file already exists, skipping download"
    else
        url=${URLS[i]}
        wget "$url"
        if [ -f "$file" ]; then
            echo "$url successfully downloaded."
        else
            echo "$url not successfully downloaded."
            # Exit statuses are limited to 0-255; use a plain non-zero
            # status instead of -1.
            exit 1
        fi
        # Choose the extraction tool from the last four characters of the name.
        if [ "${file: -4}" == ".tgz" ]; then
            tar zxvf "$file"
        elif [ "${file: -4}" == ".tar" ]; then
            tar xvf "$file"
        elif [ "${file: -4}" == ".zip" ]; then
            unzip "$file"
        fi
    fi
done
# NOTE(review): nothing above changes directory, so this only moves the
# script's own shell up one level before it exits — confirm it is still needed.
cd ..
# LASER Language-Agnostic SEntence Representations
LASER is a library to calculate and use multilingual sentence embeddings.
You can find more information about LASER and how to use it on the official [LASER repository](https://github.com/facebookresearch/LASER).
This folder contains source code for training LASER embeddings.
## Prepare data and configuration file
Binarize your data with fairseq, as described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing).
Create a json config file with this format:
```
{
"src_vocab": "/path/to/spm.src.cvocab",
"tgt_vocab": "/path/to/spm.tgt.cvocab",
"train": [
{
"type": "translation",
"id": 0,
"src": "/path/to/srclang1-tgtlang0/train.srclang1",
"tgt": "/path/to/srclang1-tgtlang0/train.tgtlang0"
},
{
"type": "translation",
"id": 1,
"src": "/path/to/srclang1-tgtlang1/train.srclang1",
"tgt": "/path/to/srclang1-tgtlang1/train.tgtlang1"
},
{
"type": "translation",
"id": 0,
"src": "/path/to/srclang2-tgtlang0/train.srclang2",
"tgt": "/path/to/srclang2-tgtlang0/train.tgtlang0"
},
{
"type": "translation",
"id": 1,
"src": "/path/to/srclang2-tgtlang1/train.srclang2",
"tgt": "/path/to/srclang2-tgtlang1/train.tgtlang1"
},
...
],
"valid": [
{
"type": "translation",
"id": 0,
"src": "/unused",
"tgt": "/unused"
}
]
}
```
where the `src` and `tgt` values are paths to binarized, indexed fairseq dataset files.
`id` represents the target language id.
## Training Command Line Example
```
fairseq-train \
/path/to/configfile_described_above.json \
--user-dir examples/laser/laser_src \
--log-interval 100 --log-format simple \
--task laser --arch laser_lstm \
--save-dir . \
--optimizer adam \
--lr 0.001 \
--lr-scheduler inverse_sqrt \
--clip-norm 5 \
--warmup-updates 90000 \
--update-freq 2 \
--dropout 0.0 \
--encoder-dropout-out 0.1 \
--max-tokens 2000 \
--max-epoch 50 \
--encoder-bidirectional \
--encoder-layers 5 \
--encoder-hidden-size 512 \
--decoder-layers 1 \
--decoder-hidden-size 2048 \
--encoder-embed-dim 320 \
--decoder-embed-dim 320 \
--decoder-lang-embed-dim 32 \
--warmup-init-lr 0.001 \
--disable-validation
```
## Applications
We showcase several applications of multilingual sentence embeddings
with code to reproduce our results (in the directory "tasks").
* [**Cross-lingual document classification**](https://github.com/facebookresearch/LASER/tree/master/tasks/mldoc) using the
[*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6]
* [**WikiMatrix**](https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix)
Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7]
* [**Bitext mining**](https://github.com/facebookresearch/LASER/tree/master/tasks/bucc) using the
[*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5]
* [**Cross-lingual NLI**](https://github.com/facebookresearch/LASER/tree/master/tasks/xnli)
using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6]
* [**Multilingual similarity search**](https://github.com/facebookresearch/LASER/tree/master/tasks/similarity) [1,6]
* [**Sentence embedding of text files**](https://github.com/facebookresearch/LASER/tree/master/tasks/embed)
an example of how to calculate sentence embeddings for arbitrary text files in any of the supported languages.
**For all tasks, we use exactly the same multilingual encoder, without any task specific optimization or fine-tuning.**
## References
[1] Holger Schwenk and Matthijs Douze,
[*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619),
ACL workshop on Representation Learning for NLP, 2017
[2] Holger Schwenk and Xian Li,
[*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf),
LREC, pages 3548-3551, 2018.
[3] Holger Schwenk,
[*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037)
ACL, July 2018
[4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov,
[*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269),
EMNLP, 2018.
[5] Mikel Artetxe and Holger Schwenk,
[*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136)
arXiv, Nov 3 2018.
[6] Mikel Artetxe and Holger Schwenk,
[*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464)
arXiv, Dec 26 2018.
[7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman,
[*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791)
arXiv, July 11 2019.
[8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin
[*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment