Commit 9e8a8c05 by jerrrrry: Initial commit
## DL params
export MAX_TOKENS=6912
export LEARNING_RATE="1.9e-3"
export WARMUP_UPDATES=750
export EXTRA_PARAMS="--max-source-positions 64 --max-target-positions 64 --distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --adam-betas (0.9,0.98) "
## System run params
export DGXNNODES=2
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=01:00:00
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # HT on is 2, HT off is 1
# Topology file for distributed optimizer
export NCCL_TOPO_FILE=/workspace/translation/DGXA100-nic-affinity-minimal.xml
## DL params
export MAX_TOKENS=1536
export LEARNING_RATE="1.732e-3"
export WARMUP_UPDATES=400
export EXTRA_PARAMS="--distributed-weight-update 2 --dwu-num-blocks 4 --dwu-num-rs-pg 2 --dwu-num-ar-pg 2 --dwu-num-ag-pg 0 --dwu-overlap-reductions --dwu-num-chunks 1 --dwu-flat-mt --dwu-compute-L2-grad-norm --max-source-positions 76 --max-target-positions 76 --adam-betas (0.86,0.92) "
## System run params
export DGXNNODES=60
export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
export WALLTIME=00:20:00
## System config params
export DGXNGPU=8
export DGXSOCKETCORES=64
export DGXNSOCKET=2
export DGXHT=2 # HT on is 2, HT off is 1
# Topology file for distributed optimizer
export NCCL_TOPO_FILE=/workspace/translation/DGXA100-nic-affinity-minimal.xml
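The two configuration blocks above are DGX system configs meant to be sourced by a benchmark launcher rather than executed directly. As a rough sketch (the config filename and the `run.sub` launcher name are assumptions, not files in this commit), a SLURM submission could consume them along these lines:
```
# Minimal sketch: source a config, then submit a job whose geometry matches it.
# config_DGXA100_multi.sh and run.sub are assumed names, not part of this commit.
source ./config_DGXA100_multi.sh
echo "Requesting ${DGXNNODES} nodes x ${DGXNGPU} GPUs = $(( DGXNNODES * DGXNGPU )) workers"
sbatch --nodes="${DGXNNODES}" --ntasks-per-node="${DGXNGPU}" --time="${WALLTIME}" run.sub
```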
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import glob
import os
import random
import sys
import tarfile
import urllib
import six
import urllib.request
from fairseq.data import indexed_dataset
from fairseq.data import dictionary
from fairseq.tokenizer import MockTokenizer
def make_binary_dataset(data_dir):
    # Load the shared dictionary and convert every utf8 split to fairseq's binary format.
    vocab = dictionary.Dictionary.load(data_dir + '/dict.en.txt')
    print('Converting utf8 files to fairseq binary')
    files = glob.glob(data_dir + '/utf8/test*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/test*.de', recursive=True)
    files += glob.glob(data_dir + '/utf8/dev*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/dev*.de', recursive=True)
    files += glob.glob(data_dir + '/utf8/train*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/train*.de', recursive=True)
    for file in files:
        print('Converting file:', file)
        ds = indexed_dataset.IndexedDatasetBuilder(file + '.bin')

        def consumer(tensor):
            ds.add_item(tensor)

        MockTokenizer.binarize(file, vocab, consumer)
        ds.finalize(file + '.idx')


def main(unused_argv):
    make_binary_dataset(FLAGS.data_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir", "-dd", type=str, default="/research/transformer/processed_data",
        help="[default: %(default)s] Directory where the "
             "translate_ende_wmt32k dataset is saved.",
        metavar="<DD>")
    FLAGS, unparsed = parser.parse_known_args()
    main(sys.argv)
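A possible invocation of the conversion utility above (the `make_binary_dataset.py` filename is an assumption; `--data_dir` must contain `dict.en.txt` and a `utf8/` subdirectory with the train/dev/test files, as the code expects):
```
# Assumed filename for the script above; adjust to wherever it is saved.
python make_binary_dataset.py --data_dir /research/transformer/processed_data
```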
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import os
import socket
import subprocess
from train import main as single_process_main
from fairseq import distributed_utils, options
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError as e:  # Slurm is not installed
                pass
    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    args.device_id = args.local_rank
    print('| initialized host {} as rank {} and device id {}'.format(
        socket.gethostname(), args.distributed_rank, args.device_id))
    single_process_main(args)


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
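A hedged sketch of launching the wrapper above under SLURM, which exercises the automatic init-method detection in `main()` (the `distributed_train.py` filename, node counts, dataset path, and port are placeholders):
```
# Each task reads SLURM_PROCID / SLURM_LOCALID and resolves the rendezvous host
# from SLURM_JOB_NODELIST via scontrol, as in main() above.
srun --nodes=2 --ntasks-per-node=8 \
    python distributed_train.py data-bin/wmt14_en_de_joined_dict \
        --arch transformer_vaswani_wmt_en_de_big \
        --distributed-world-size 16 --distributed-port 12345
```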
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import numpy as np
import torch
from fairseq import data, options, progress_bar, tasks, utils
from fairseq.meters import StopwatchMeter, TimeMeter
from fairseq.sequence_scorer import SequenceScorer
def main(args):
    assert args.path is not None, '--path required for evaluation!'

    args.tokens_per_sample = getattr(args, 'tokens_per_sample', 1024)
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task)

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
    assert len(models) > 0

    itr = data.EpochBatchIterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens or 36000,
        max_sentences=args.max_sentences,
        max_positions=models[0].max_positions(),
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        ignore_invalid_inputs=True,
    ).next_epoch_itr(shuffle=False)

    gen_timer = StopwatchMeter()
    scorer = SequenceScorer(models, task.target_dictionary)
    if use_cuda:
        scorer.cuda()

    score_sum = 0.
    count = 0

    if args.remove_bpe is not None:
        bpe_cont = args.remove_bpe.rstrip()
        bpe_toks = set(i for i in range(len(task.dictionary)) if task.dictionary[i].endswith(bpe_cont))
        bpe_len = len(bpe_cont)
    else:
        bpe_toks = None
        bpe_len = 0

    with progress_bar.build_progress_bar(args, itr) as t:
        results = scorer.score_batched_itr(t, cuda=use_cuda, timer=gen_timer)
        wps_meter = TimeMeter()
        for _, src_tokens, __, hypos in results:
            for hypo in hypos:
                pos_scores = hypo['positional_scores']

                skipped_toks = 0
                if bpe_toks is not None:
                    for i in range(len(hypo['tokens']) - 1):
                        if hypo['tokens'][i].item() in bpe_toks:
                            skipped_toks += 1
                            pos_scores[i + 1] += pos_scores[i]
                            pos_scores[i] = 0

                inf_scores = pos_scores.eq(float('inf')) | pos_scores.eq(float('-inf'))
                if inf_scores.any():
                    print('| Skipping tokens with inf scores:',
                          task.target_dictionary.string(hypo['tokens'][inf_scores.nonzero()]))
                    pos_scores = pos_scores[(~inf_scores).nonzero()]
                score_sum += pos_scores.sum()
                count += pos_scores.numel() - skipped_toks

                if args.output_word_probs:
                    w = ''
                    word_prob = []
                    for i in range(len(hypo['tokens'])):
                        w_ind = hypo['tokens'][i].item()
                        w += task.dictionary[w_ind]
                        if bpe_toks is not None and w_ind in bpe_toks:
                            w = w[:-bpe_len]
                        else:
                            word_prob.append((w, pos_scores[i].item()))
                            w = ''
                    print('\t'.join('{} [{:2f}]'.format(x[0], x[1]) for x in word_prob))

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})

    avg_nll_loss = -score_sum / count
    print('| Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)'.format(gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
    print('| Loss: {:.4f}, Perplexity: {:.2f}'.format(avg_nll_loss, np.exp(avg_nll_loss)))


if __name__ == '__main__':
    parser = options.get_eval_lm_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
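Besides the perplexity summary, the script can print per-word scores through the `--output-word-probs` branch shown above. A hedged example, reusing the WikiText-103 paths from the README that follows (checkpoint location assumed):
```
python eval_lm.py data-bin/wikitext-103 \
    --path checkpoints/wikitext-103/checkpoint_best.pt \
    --output-word-probs
```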
Sample data processing scripts for the FAIR Sequence-to-Sequence Toolkit
These scripts provide an example of pre-processing data for the Language Modeling task.
# prepare-wikitext-103.sh
Provides an example of pre-processing for [WikiText-103 language modeling task](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset):
Example usage:
```
$ cd examples/language_model/
$ bash prepare-wikitext-103.sh
$ cd ../..
# Binarize the dataset:
$ TEXT=examples/language_model/wikitext-103
$ python preprocess.py --only-source \
--trainpref $TEXT/wiki.train.tokens --validpref $TEXT/wiki.valid.tokens --testpref $TEXT/wiki.test.tokens \
--destdir data-bin/wikitext-103
# Train the model:
# If it runs out of memory, try to reduce max-tokens and max-target-positions
$ mkdir -p checkpoints/wikitext-103
$ python train.py --task language_modeling data-bin/wikitext-103 \
--max-epoch 35 --arch fconv_lm_dauphin_wikitext103 --optimizer nag \
--lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
--clip-norm 0.1 --dropout 0.2 --weight-decay 5e-06 --criterion adaptive_loss \
  --adaptive-softmax-cutoff 10000,20000,200000 --max-tokens 1024 --tokens-per-sample 1024 \
  --save-dir checkpoints/wikitext-103
# Evaluate:
$ python eval_lm.py data-bin/wikitext-103 --path 'checkpoints/wikitext-103/checkpoint_best.pt'
```
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
URLS=(
"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
)
FILES=(
"wikitext-103-v1.zip"
)
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit -1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
elif [ ${file: -4} == ".zip" ]; then
unzip $file
fi
fi
done
cd ..
FAIR Sequence-to-Sequence Toolkit for Story Generation
The following commands provide an example of pre-processing data, training a model, and generating text for story generation with the WritingPrompts dataset.
The dataset can be downloaded like this:
```
curl https://s3.amazonaws.com/fairseq-py/data/writingPrompts.tar.gz | tar xvzf -
```
and contains a train, test, and valid split. The dataset is described here: https://arxiv.org/abs/1805.04833. We model only the first 1000 words of each story, including one newLine token.
Example usage:
```
# Binarize the dataset:
$ TEXT=examples/stories/writingPrompts
$ python preprocess.py --source-lang wp_source --target-lang wp_target \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/writingPrompts --padding-factor 1 --thresholdtgt 10 --thresholdsrc 10
# Train the model:
$ python train.py data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau --decoder-attention True --encoder-attention False --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 --source-lang wp_source --target-lang wp_target --gated-attention True --self-attention True --project-input True --pretrained False
# Train a fusion model:
# add the arguments: --pretrained True --pretrained-checkpoint path/to/checkpoint
# Generate:
# Note: to load the pretrained model at generation time, pass a --model-overrides
# argument that tells the fusion model where the pretrained checkpoint is located.
# By default it loads the exact path of the fusion model's pretrained model from
# training time, so --model-overrides is needed whenever you have moved the
# pretrained model (or are using our provided models). It is not necessary when
# generating from a non-fusion model.
$ python generate.py data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --sampling-temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}"
```
# Example usage for Neural Machine Translation
These scripts provide an example of pre-processing data for the NMT task
and instructions for how to replicate the results from the paper [Scaling Neural Machine Translation (Ott et al., 2018)](https://arxiv.org/abs/1806.00187).
## Preprocessing
### prepare-iwslt14.sh
Provides an example of pre-processing for the IWSLT'14 German to English translation task: ["Report on the 11th IWSLT evaluation campaign" by Cettolo et al.](http://workshop2014.iwslt.org/downloads/proceeding.pdf)
Example usage:
```
$ cd examples/translation/
$ bash prepare-iwslt14.sh
$ cd ../..
# Binarize the dataset:
$ TEXT=examples/translation/iwslt14.tokenized.de-en
$ python preprocess.py --source-lang de --target-lang en \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/iwslt14.tokenized.de-en
# Train the model (better for a single GPU setup):
$ mkdir -p checkpoints/fconv
$ CUDA_VISIBLE_DEVICES=0 python train.py data-bin/iwslt14.tokenized.de-en \
--lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler fixed --force-anneal 200 \
--arch fconv_iwslt_de_en --save-dir checkpoints/fconv
# Generate:
$ python generate.py data-bin/iwslt14.tokenized.de-en \
--path checkpoints/fconv/checkpoint_best.pt \
--batch-size 128 --beam 5 --remove-bpe
```
To train a transformer model on IWSLT'14 German to English:
```
# Preparation steps are the same as for fconv model.
# Train the model (better for a single GPU setup):
$ mkdir -p checkpoints/transformer
$ CUDA_VISIBLE_DEVICES=0 python train.py data-bin/iwslt14.tokenized.de-en \
-a transformer_iwslt_de_en --optimizer adam --lr 0.0005 -s de -t en \
--label-smoothing 0.1 --dropout 0.3 --max-tokens 4000 \
--min-lr '1e-09' --lr-scheduler inverse_sqrt --weight-decay 0.0001 \
--criterion label_smoothed_cross_entropy --max-update 50000 \
--warmup-updates 4000 --warmup-init-lr '1e-07' \
--adam-betas '(0.9, 0.98)' --save-dir checkpoints/transformer
# Average 10 latest checkpoints:
$ python scripts/average_checkpoints.py --inputs checkpoints/transformer \
--num-epoch-checkpoints 10 --output checkpoints/transformer/model.pt
# Generate:
$ python generate.py data-bin/iwslt14.tokenized.de-en \
--path checkpoints/transformer/model.pt \
--batch-size 128 --beam 5 --remove-bpe
```
### prepare-wmt14en2de.sh
Provides an example of pre-processing for the WMT'14 English to German translation task. By default it produces a dataset modeled after the one used in ["Attention Is All You Need" by Vaswani et al.](https://arxiv.org/abs/1706.03762), which includes the news-commentary-v12 data.
To use only the data available in WMT'14, or to replicate the results of the original paper ["Convolutional Sequence to Sequence Learning" by Gehring et al.](https://arxiv.org/abs/1705.03122), run it with `--icml17` instead:
```
$ bash prepare-wmt14en2de.sh --icml17
```
Example usage:
```
$ cd examples/translation/
$ bash prepare-wmt14en2de.sh
$ cd ../..
# Binarize the dataset:
$ TEXT=examples/translation/wmt14_en_de
$ python preprocess.py --source-lang en --target-lang de \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/wmt14_en_de --thresholdtgt 0 --thresholdsrc 0
# Train the model:
# If it runs out of memory, try to set --max-tokens 1500 instead
$ mkdir -p checkpoints/fconv_wmt_en_de
$ python train.py data-bin/wmt14_en_de \
--lr 0.5 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler fixed --force-anneal 50 \
--arch fconv_wmt_en_de --save-dir checkpoints/fconv_wmt_en_de
# Generate:
$ python generate.py data-bin/wmt14_en_de \
--path checkpoints/fconv_wmt_en_de/checkpoint_best.pt --beam 5 --remove-bpe
```
### prepare-wmt14en2fr.sh
Provides an example of pre-processing for the WMT'14 English to French translation task.
Example usage:
```
$ cd examples/translation/
$ bash prepare-wmt14en2fr.sh
$ cd ../..
# Binarize the dataset:
$ TEXT=examples/translation/wmt14_en_fr
$ python preprocess.py --source-lang en --target-lang fr \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/wmt14_en_fr --thresholdtgt 0 --thresholdsrc 0
# Train the model:
# If it runs out of memory, try to set --max-tokens 1000 instead
$ mkdir -p checkpoints/fconv_wmt_en_fr
$ python train.py data-bin/wmt14_en_fr \
--lr 0.5 --clip-norm 0.1 --dropout 0.1 --max-tokens 3000 \
--criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--lr-scheduler fixed --force-anneal 50 \
--arch fconv_wmt_en_fr --save-dir checkpoints/fconv_wmt_en_fr
# Generate:
$ python generate.py data-bin/wmt14_en_fr \
--path checkpoints/fconv_wmt_en_fr/checkpoint_best.pt --beam 5 --remove-bpe
```
## Replicating results from "Scaling Neural Machine Translation"
To replicate results from the paper [Scaling Neural Machine Translation (Ott et al., 2018)](https://arxiv.org/abs/1806.00187):
1. Prepare the WMT'14 En-De data with a BPE vocab of 32k:
```
$ bash prepare-wmt14en2de.sh --scaling18
$ cd ../..
```
2. Preprocess the dataset with a joined dictionary:
```
$ TEXT=examples/translation/wmt14_en_de
$ python preprocess.py --source-lang en --target-lang de \
--trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \
--destdir data-bin/wmt14_en_de_joined_dict \
--nwordssrc 32768 --nwordstgt 32768 \
--joined-dictionary
```
3. Train a model:
```
$ python train.py data-bin/wmt14_en_de_joined_dict \
--arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
--optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
--lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
--lr 0.0005 --min-lr 1e-09 \
--dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
--max-tokens 3584 \
--fp16
```
Note that the `--fp16` flag requires that you have CUDA 9.1 or greater and a Volta GPU.
If you want to train the above model with big batches (assuming your machine has 8 GPUs):
- add `--update-freq 16` to simulate training on 8*16=128 GPUs
- increase the learning rate; 0.001 works well for big batches (see the combined command sketched below)
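Putting these two adjustments together with the command above gives, as a sketch (assuming a single machine with 8 GPUs):
```
$ python train.py data-bin/wmt14_en_de_joined_dict \
  --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
  --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
  --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \
  --lr 0.001 --min-lr 1e-09 \
  --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
  --max-tokens 3584 --update-freq 16 \
  --fp16
```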
#!/usr/bin/env bash
#
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
LC=$SCRIPTS/tokenizer/lowercase.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
BPEROOT=subword-nmt
BPE_TOKENS=10000
URL="https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz"
GZ=de-en.tgz
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit
fi
src=de
tgt=en
lang=de-en
prep=iwslt14.tokenized.de-en
tmp=$prep/tmp
orig=orig
mkdir -p $orig $tmp $prep
echo "Downloading data from ${URL}..."
cd $orig
wget "$URL"
if [ -f $GZ ]; then
echo "Data successfully downloaded."
else
echo "Data not successfully downloaded."
exit
fi
tar zxvf $GZ
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
f=train.tags.$lang.$l
tok=train.tags.$lang.tok.$l
cat $orig/$lang/$f | \
grep -v '<url>' | \
grep -v '<talkid>' | \
grep -v '<keywords>' | \
sed -e 's/<title>//g' | \
sed -e 's/<\/title>//g' | \
sed -e 's/<description>//g' | \
sed -e 's/<\/description>//g' | \
perl $TOKENIZER -threads 8 -l $l > $tmp/$tok
echo ""
done
perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175
for l in $src $tgt; do
perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l
done
echo "pre-processing valid/test data..."
for l in $src $tgt; do
for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do
fname=${o##*/}
f=$tmp/${fname%.*}
echo $o $f
grep '<seg id' $o | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -l $l | \
perl $LC > $f
echo ""
done
done
echo "creating train, valid, test..."
for l in $src $tgt; do
awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/valid.$l
awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/train.$l
cat $tmp/IWSLT14.TED.dev2010.de-en.$l \
$tmp/IWSLT14.TEDX.dev2012.de-en.$l \
$tmp/IWSLT14.TED.tst2010.de-en.$l \
$tmp/IWSLT14.TED.tst2011.de-en.$l \
$tmp/IWSLT14.TED.tst2012.de-en.$l \
> $tmp/test.$l
done
TRAIN=$tmp/train.en-de
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
cat $tmp/train.$l >> $TRAIN
done
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
for L in $src $tgt; do
for f in train.$L valid.$L test.$L; do
echo "apply_bpe.py to ${f}..."
python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $prep/$f
done
done
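When the script finishes, the learned BPE code and the encoded splits are written under `iwslt14.tokenized.de-en/`, which is the layout the `preprocess.py` command in the translation README above expects:
```
ls iwslt14.tokenized.de-en
# expected: code  test.de  test.en  train.de  train.en  valid.de  valid.en
```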
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt
BPE_TOKENS=40000
URLS=(
"http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
"http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
"http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz"
"http://data.statmt.org/wmt17/translation-task/dev.tgz"
"http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
"training-parallel-europarl-v7.tgz"
"training-parallel-commoncrawl.tgz"
"training-parallel-nc-v12.tgz"
"dev.tgz"
"test-full.tgz"
)
CORPORA=(
"training/europarl-v7.de-en"
"commoncrawl.de-en"
"training/news-commentary-v12.de-en"
)
# This will make the dataset compatible to the one used in "Convolutional Sequence to Sequence Learning"
# https://arxiv.org/abs/1705.03122
if [ "$1" == "--icml17" ]; then
URLS[2]="http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
FILES[2]="training-parallel-nc-v9.tgz"
CORPORA[2]="training/news-commentary-v9.de-en"
fi
# This will make the dataset comparable to the one used in "Scaling Neural Machine Translation"
# https://arxiv.org/abs/1806.00187
if [ "$1" == "--scaling18" ]; then
BPE_TOKENS=33708
fi
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit
fi
src=en
tgt=de
lang=en-de
prep=wmt14_en_de
tmp=$prep/tmp
orig=orig
dev=dev/newstest2013
mkdir -p $orig $tmp $prep
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit -1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
fi
done
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
rm -f $tmp/train.tags.$lang.tok.$l
for f in "${CORPORA[@]}"; do
cat $orig/$f.$l | \
perl $NORM_PUNC $l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
done
done
echo "pre-processing test data..."
for l in $src $tgt; do
if [ "$l" == "$src" ]; then
t="src"
else
t="ref"
fi
grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
echo ""
done
if [ "$1" == "--scaling18" ]; then
# apply length filtering before BPE for --scaling18
perl $CLEAN $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 256
# use newstest2013 for valid
echo "pre-processing valid data..."
for l in $src $tgt; do
rm -f $tmp/valid.$l
cat $orig/$dev.$l | \
perl $NORM_PUNC $l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -a -l $l >> $tmp/valid.$l
done
else
echo "splitting train and valid..."
for l in $src $tgt; do
awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
done
fi
TRAIN=$tmp/train.de-en
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
cat $tmp/train.$l >> $TRAIN
done
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
for L in $src $tgt; do
for f in train.$L valid.$L test.$L; do
echo "apply_bpe.py to ${f}..."
python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
done
done
if [ "$1" == "--scaling18" ]; then
for L in $src $tgt; do
cp $tmp/bpe.train.$L $prep/train.$L
cp $tmp/bpe.valid.$L $prep/valid.$L
done
else
perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
fi
for L in $src $tgt; do
cp $tmp/bpe.test.$L $prep/test.$L
done
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
echo 'Cloning Moses github repository (for tokenization scripts)...'
git clone https://github.com/moses-smt/mosesdecoder.git
echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
git clone https://github.com/rsennrich/subword-nmt.git
SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt
BPE_TOKENS=40000
URLS=(
"http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
"http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
"http://statmt.org/wmt13/training-parallel-un.tgz"
"http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
"http://statmt.org/wmt10/training-giga-fren.tar"
"http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
"training-parallel-europarl-v7.tgz"
"training-parallel-commoncrawl.tgz"
"training-parallel-un.tgz"
"training-parallel-nc-v9.tgz"
"training-giga-fren.tar"
"test-full.tgz"
)
CORPORA=(
"training/europarl-v7.fr-en"
"commoncrawl.fr-en"
"un/undoc.2000.fr-en"
"training/news-commentary-v9.fr-en"
"giga-fren.release2.fixed"
)
if [ ! -d "$SCRIPTS" ]; then
echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit
fi
src=en
tgt=fr
lang=en-fr
prep=wmt14_en_fr
tmp=$prep/tmp
orig=orig
mkdir -p $orig $tmp $prep
cd $orig
for ((i=0;i<${#URLS[@]};++i)); do
file=${FILES[i]}
if [ -f $file ]; then
echo "$file already exists, skipping download"
else
url=${URLS[i]}
wget "$url"
if [ -f $file ]; then
echo "$url successfully downloaded."
else
echo "$url not successfully downloaded."
exit -1
fi
if [ ${file: -4} == ".tgz" ]; then
tar zxvf $file
elif [ ${file: -4} == ".tar" ]; then
tar xvf $file
fi
fi
done
gunzip giga-fren.release2.fixed.*.gz
cd ..
echo "pre-processing train data..."
for l in $src $tgt; do
rm $tmp/train.tags.$lang.tok.$l
for f in "${CORPORA[@]}"; do
cat $orig/$f.$l | \
perl $NORM_PUNC $l | \
perl $REM_NON_PRINT_CHAR | \
perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l
done
done
echo "pre-processing test data..."
for l in $src $tgt; do
if [ "$l" == "$src" ]; then
t="src"
else
t="ref"
fi
grep '<seg id' $orig/test-full/newstest2014-fren-$t.$l.sgm | \
sed -e 's/<seg id="[0-9]*">\s*//g' | \
sed -e 's/\s*<\/seg>\s*//g' | \
sed -e "s/\’/\'/g" | \
perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l
echo ""
done
echo "splitting train and valid..."
for l in $src $tgt; do
awk '{if (NR%1333 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l
awk '{if (NR%1333 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l
done
TRAIN=$tmp/train.fr-en
BPE_CODE=$prep/code
rm -f $TRAIN
for l in $src $tgt; do
cat $tmp/train.$l >> $TRAIN
done
echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE
for L in $src $tgt; do
for f in train.$L valid.$L test.$L; do
echo "apply_bpe.py to ${f}..."
python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f
done
done
perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250
perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250
for L in $src $tgt; do
cp $tmp/bpe.test.$L $prep/test.$L
done
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
from .multiprocessing_pdb import pdb
__all__ = ['pdb']