Reorganize examples (#9010)

* Reorganize example folder * Continue reorganization * Change requirements for tests * Final cleanup * Finish regroup with tests all passing * Copyright * Requirements and readme * Make a full link for the documentation * Address review comments * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Add symlink * Reorg again * Apply suggestions from code review Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com> * Adapt title * Update to new strucutre * Remove test * Update READMEs Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

Reorganize examples (#9010)
* Reorganize example folder * Continue reorganization * Change requirements for tests * Final cleanup * Finish regroup with tests all passing * Copyright * Requirements and readme * Make a full link for the documentation * Address review comments * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Add symlink * Reorg again * Apply suggestions from code review Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com> * Adapt title * Update to new strucutre * Remove test * Update READMEs Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
783d7d26 · Sylvain Gugger · GitHub · 86896de0 · 783d7d26 · 783d7d26
Unverified Commit 783d7d26 authored Dec 11, 2020 by Sylvain Gugger Committed by GitHub Dec 11, 2020
20 changed files
--- a/examples/research_projects/seq2seq-distillation/sentence_splitter.py
+++ b/examples/research_projects/seq2seq-distillation/sentence_splitter.py
+import re
+
+from filelock import FileLock
+
+
+try:
+    import nltk
+
+    NLTK_AVAILABLE = True
+except (ImportError, ModuleNotFoundError):
+    NLTK_AVAILABLE = False
+
+if NLTK_AVAILABLE:
+    with FileLock(".lock") as lock:
+        nltk.download("punkt", quiet=True)
+
+
+def add_newline_to_end_of_each_sentence(x: str) -> str:
+    """This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
+    re.sub("<n>", "", x)  # remove pegasus newline char
+    assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
+    return "\n".join(nltk.sent_tokenize(x))
--- a/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh
+++ b/examples/research_projects/seq2seq-distillation/train_distilbart_cnn.sh
+#!/usr/bin/env bash
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+export BS=32
+export GAS=1
+
+python finetune.py \
+    --learning_rate=3e-5 \
+    --fp16 \
+    --gpus 1 \
+    --do_train \
+    --do_predict \
+    --val_check_interval 0.25 \
+    --n_val 500 \
+    --num_train_epochs 2 \
+    --freeze_encoder --freeze_embeds --data_dir cnn_dm \
+    --max_target_length 142 --val_max_target_length=142 \
+    --train_batch_size=$BS --eval_batch_size=$BS --gradient_accumulation_steps=$GAS \
+    --model_name_or_path sshleifer/student_cnn_12_6 \
+    --tokenizer_name facebook/bart-large \
+    --warmup_steps 500 \
+    --output_dir distilbart-cnn-12-6 \
+    "$@"
+
--- a/examples/seq2seq/train_distilbart_xsum.sh
+++ b/examples/seq2seq/train_distilbart_xsum.sh
--- a/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh
+++ b/examples/research_projects/seq2seq-distillation/train_mbart_cc25_enro.sh
+#!/usr/bin/env bash
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+python finetune.py \
+    --learning_rate=3e-5 \
+    --fp16 \
+    --do_train \
+    --val_check_interval=0.25 \
+    --adam_eps 1e-06 \
+    --num_train_epochs 6 --src_lang en_XX --tgt_lang ro_RO \
+    --data_dir $ENRO_DIR \
+    --max_source_length $MAX_LEN --max_target_length $MAX_LEN --val_max_target_length $MAX_LEN --test_max_target_length $MAX_LEN \
+    --train_batch_size=$BS --eval_batch_size=$BS \
+    --task translation \
+    --warmup_steps 500 \
+    --freeze_embeds \
+    --model_name_or_path=facebook/mbart-large-cc25 \
+    "$@"
--- a/examples/research_projects/seq2seq-distillation/utils copy.py
+++ b/examples/research_projects/seq2seq-distillation/utils copy.py
+import itertools
+import json
+import linecache
+import math
+import os
+import pickle
+import socket
+from logging import getLogger
+from pathlib import Path
+from typing import Callable, Dict, Iterable, List, Tuple, Union
+
+import git
+import numpy as np
+import torch
+import torch.distributed as dist
+from rouge_score import rouge_scorer, scoring
+from sacrebleu import corpus_bleu
+from torch import nn
+from torch.utils.data import Dataset, Sampler
+
+from sentence_splitter import add_newline_to_end_of_each_sentence
+from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
+from transformers.file_utils import cached_property
+from transformers.models.bart.modeling_bart import shift_tokens_right
+
+
+try:
+    from fairseq.data.data_utils import batch_by_size
+
+    FAIRSEQ_AVAILABLE = True
+except (ImportError, ModuleNotFoundError):
+    FAIRSEQ_AVAILABLE = False
+
+
+def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
+    """From fairseq"""
+    if target.dim() == lprobs.dim() - 1:
+        target = target.unsqueeze(-1)
+    nll_loss = -lprobs.gather(dim=-1, index=target)
+    smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+    if ignore_index is not None:
+        pad_mask = target.eq(ignore_index)
+        nll_loss.masked_fill_(pad_mask, 0.0)
+        smooth_loss.masked_fill_(pad_mask, 0.0)
+    else:
+        nll_loss = nll_loss.squeeze(-1)
+        smooth_loss = smooth_loss.squeeze(-1)
+
+    nll_loss = nll_loss.sum()  # mean()? Scared to break other math.
+    smooth_loss = smooth_loss.sum()
+    eps_i = epsilon / lprobs.size(-1)
+    loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
+    return loss, nll_loss
+
+
+def lmap(f: Callable, x: Iterable) -> List:
+    """list(map(f, x))"""
+    return list(map(f, x))
+
+
+def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
+    """Uses sacrebleu's corpus_bleu implementation."""
+    return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}
+
+
+def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], Dict]:
+    def non_pad_len(tokens: np.ndarray) -> int:
+        return np.count_nonzero(tokens != tokenizer.pad_token_id)
+
+    def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
+        pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
+        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
+        pred_str = lmap(str.strip, pred_str)
+        label_str = lmap(str.strip, label_str)
+        return pred_str, label_str
+
+    def summarization_metrics(pred: EvalPrediction) -> Dict:
+        pred_str, label_str = decode_pred(pred)
+        rouge: Dict = calculate_rouge(pred_str, label_str)
+        summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
+        rouge.update({"gen_len": summ_len})
+        return rouge
+
+    def translation_metrics(pred: EvalPrediction) -> Dict:
+        pred_str, label_str = decode_pred(pred)
+        bleu: Dict = calculate_bleu(pred_str, label_str)
+        gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
+        bleu.update({"gen_len": gen_len})
+        return bleu
+
+    compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics
+    return compute_metrics_fn
+
+
+def trim_batch(
+    input_ids,
+    pad_token_id,
+    attention_mask=None,
+):
+    """Remove columns that are populated exclusively by pad_token_id"""
+    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
+    if attention_mask is None:
+        return input_ids[:, keep_column_mask]
+    else:
+        return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask])
+
+
+class AbstractSeq2SeqDataset(Dataset):
+    def __init__(
+        self,
+        tokenizer,
+        data_dir,
+        max_source_length,
+        max_target_length,
+        type_path="train",
+        n_obs=None,
+        prefix="",
+        **dataset_kwargs
+    ):
+        super().__init__()
+        self.src_file = Path(data_dir).joinpath(type_path + ".source")
+        self.tgt_file = Path(data_dir).joinpath(type_path + ".target")
+        self.len_file = Path(data_dir).joinpath(type_path + ".len")
+        if os.path.exists(self.len_file):
+            self.src_lens = pickle_load(self.len_file)
+            self.used_char_len = False
+        else:
+            self.src_lens = self.get_char_lens(self.src_file)
+            self.used_char_len = True
+        self.max_source_length = max_source_length
+        self.max_target_length = max_target_length
+        assert min(self.src_lens) > 0, f"found empty line in {self.src_file}"
+        self.tokenizer = tokenizer
+        self.prefix = prefix if prefix is not None else ""
+
+        if n_obs is not None:
+            self.src_lens = self.src_lens[:n_obs]
+        self.pad_token_id = self.tokenizer.pad_token_id
+        self.dataset_kwargs = dataset_kwargs
+        dataset_kwargs.update({"add_prefix_space": True} if isinstance(self.tokenizer, BartTokenizer) else {})
+
+    def __len__(self):
+        return len(self.src_lens)
+
+    @staticmethod
+    def get_char_lens(data_file):
+        return [len(x) for x in Path(data_file).open().readlines()]
+
+    @cached_property
+    def tgt_lens(self):
+        """Length in characters of target documents"""
+        return self.get_char_lens(self.tgt_file)
+
+    def make_sortish_sampler(self, batch_size, distributed=False, shuffle=True, **kwargs):
+        if distributed:
+            return DistributedSortishSampler(self, batch_size, shuffle=shuffle, **kwargs)
+        else:
+            return SortishSampler(self.src_lens, batch_size, shuffle=shuffle)
+
+    def make_dynamic_sampler(self, max_tokens_per_batch=1024, **kwargs):
+        assert FAIRSEQ_AVAILABLE, "Dynamic batch size requires `pip install fairseq`"
+        assert not self.used_char_len, "You must call  python make_len_file.py before calling make_dynamic_sampler"
+        sorted_indices = list(self.make_sortish_sampler(1024, shuffle=False))
+
+        def num_tokens_in_example(i):
+            return min(self.src_lens[i], self.max_target_length)
+
+        # call fairseq cython function
+        batch_sampler: List[List[int]] = batch_by_size(
+            sorted_indices,
+            num_tokens_fn=num_tokens_in_example,
+            max_tokens=max_tokens_per_batch,
+            required_batch_size_multiple=64,
+        )
+        shuffled_batches = [batch_sampler[i] for i in np.random.permutation(range(len(batch_sampler)))]
+        # move the largest batch to the front to OOM quickly (uses an approximation for padding)
+        approximate_toks_per_batch = [max(self.src_lens[i] for i in batch) * len(batch) for batch in shuffled_batches]
+        largest_batch_idx = np.argmax(approximate_toks_per_batch)
+        shuffled_batches[0], shuffled_batches[largest_batch_idx] = (
+            shuffled_batches[largest_batch_idx],
+            shuffled_batches[0],
+        )
+        return shuffled_batches
+
+    def __getitem__(self, item):
+        raise NotImplementedError("You must implement this")
+
+    def collate_fn(self, batch):
+        raise NotImplementedError("You must implement this")
+
+
+class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
+    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
+        """Call tokenizer on src and tgt_lines"""
+        index = index + 1  # linecache starts at 1
+        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
+        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
+        assert source_line, f"empty source line for index {index}"
+        assert tgt_line, f"empty tgt line for index {index}"
+        source_inputs = self.encode_line(self.tokenizer, source_line, self.max_source_length)
+        target_inputs = self.encode_line(self.tokenizer, tgt_line, self.max_target_length)
+
+        source_ids = source_inputs["input_ids"].squeeze()
+        target_ids = target_inputs["input_ids"].squeeze()
+        src_mask = source_inputs["attention_mask"].squeeze()
+        return {
+            "input_ids": source_ids,
+            "attention_mask": src_mask,
+            "labels": target_ids,
+        }
+
+    def encode_line(self, tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"):
+        """Only used by LegacyDataset"""
+        return tokenizer(
+            [line],
+            max_length=max_length,
+            padding="max_length" if pad_to_max_length else None,
+            truncation=True,
+            return_tensors=return_tensors,
+            **self.dataset_kwargs,
+        )
+
+    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
+        input_ids = torch.stack([x["input_ids"] for x in batch])
+        masks = torch.stack([x["attention_mask"] for x in batch])
+        target_ids = torch.stack([x["labels"] for x in batch])
+        pad_token_id = self.pad_token_id
+        y = trim_batch(target_ids, pad_token_id)
+        source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks)
+        batch = {
+            "input_ids": source_ids,
+            "attention_mask": source_mask,
+            "labels": y,
+        }
+        return batch
+
+
+class Seq2SeqDataset(AbstractSeq2SeqDataset):
+    """A dataset that calls prepare_seq2seq_batch."""
+
+    def __getitem__(self, index) -> Dict[str, str]:
+        index = index + 1  # linecache starts at 1
+        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
+        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
+        assert source_line, f"empty source line for index {index}"
+        assert tgt_line, f"empty tgt line for index {index}"
+        return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1}
+
+    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
+        """Call prepare_seq2seq_batch."""
+        batch_encoding: Dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
+            [x["src_texts"] for x in batch],
+            tgt_texts=[x["tgt_texts"] for x in batch],
+            max_length=self.max_source_length,
+            max_target_length=self.max_target_length,
+            return_tensors="pt",
+            **self.dataset_kwargs,
+        ).data
+        batch_encoding["ids"] = torch.tensor([x["id"] for x in batch])
+        return batch_encoding
+
+
+class Seq2SeqDataCollator:
+    def __init__(self, tokenizer, data_args, tpu_num_cores=None):
+        self.tokenizer = tokenizer
+        self.pad_token_id = tokenizer.pad_token_id
+        assert (
+            self.pad_token_id is not None
+        ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
+        self.data_args = data_args
+        self.tpu_num_cores = tpu_num_cores
+        self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
+        if data_args.src_lang is not None:
+            self.dataset_kwargs["src_lang"] = data_args.src_lang
+        if data_args.tgt_lang is not None:
+            self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang
+
+    def __call__(self, batch) -> Dict[str, torch.Tensor]:
+        if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
+            batch = self._encode(batch)
+            input_ids, attention_mask, labels = (
+                batch["input_ids"],
+                batch["attention_mask"],
+                batch["labels"],
+            )
+        else:
+            input_ids = torch.stack([x["input_ids"] for x in batch])
+            attention_mask = torch.stack([x["attention_mask"] for x in batch])
+            labels = torch.stack([x["labels"] for x in batch])
+
+            labels = trim_batch(labels, self.pad_token_id)
+            input_ids, attention_mask = trim_batch(input_ids, self.pad_token_id, attention_mask=attention_mask)
+
+        if isinstance(self.tokenizer, T5Tokenizer):
+            decoder_input_ids = self._shift_right_t5(labels)
+        else:
+            decoder_input_ids = shift_tokens_right(labels, self.pad_token_id)
+
+        batch = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "labels": labels,
+        }
+        return batch
+
+    def _shift_right_t5(self, input_ids):
+        # shift inputs to the right
+        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+        shifted_input_ids[..., 0] = self.pad_token_id
+        return shifted_input_ids
+
+    def _encode(self, batch) -> Dict[str, torch.Tensor]:
+        batch_encoding = self.tokenizer.prepare_seq2seq_batch(
+            [x["src_texts"] for x in batch],
+            tgt_texts=[x["tgt_texts"] for x in batch],
+            max_length=self.data_args.max_source_length,
+            max_target_length=self.data_args.max_target_length,
+            padding="max_length" if self.tpu_num_cores is not None else "longest",  # TPU hack
+            return_tensors="pt",
+            **self.dataset_kwargs,
+        )
+        return batch_encoding.data
+
+
+class SortishSampler(Sampler):
+    "Go through the text data by order of src length with a bit of randomness. From fastai repo."
+
+    def __init__(self, data, batch_size, shuffle=True):
+        self.data, self.bs, self.shuffle = data, batch_size, shuffle
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def __iter__(self):
+        return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle))
+
+
+def sortish_sampler_indices(data: List, bs: int, shuffle=True) -> np.array:
+    "Go through the text data by order of src length with a bit of randomness. From fastai repo."
+    if not shuffle:
+        return np.argsort(np.array(data) * -1)
+
+    def key_fn(i):
+        return data[i]
+
+    idxs = np.random.permutation(len(data))
+    sz = bs * 50
+    ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)]
+    sort_idx = np.concatenate([sorted(s, key=key_fn, reverse=True) for s in ck_idx])
+    sz = bs
+    ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)]
+    max_ck = np.argmax([key_fn(ck[0]) for ck in ck_idx])  # find the chunk with the largest key,
+    ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0]  # then make sure it goes first.
+    sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=np.int)
+    sort_idx = np.concatenate((ck_idx[0], sort_idx))
+    return sort_idx
+
+
+class DistributedSortishSampler(Sampler):
+    """Copied from torch DistributedSampler"""
+
+    def __init__(self, dataset, batch_size, num_replicas=None, rank=None, add_extra_examples=True, shuffle=True):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        if add_extra_examples:
+            self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+            self.total_size = self.num_samples * self.num_replicas
+        else:
+            self.total_size = len(dataset)
+            self.num_samples = len(self.available_indices)
+        self.batch_size = batch_size
+        self.add_extra_examples = add_extra_examples
+        self.shuffle = shuffle
+
+    def __iter__(self) -> Iterable:
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+
+        sortish_data = [self.dataset.src_lens[i] for i in self.available_indices]
+        sortish_indices = sortish_sampler_indices(sortish_data, self.batch_size, shuffle=self.shuffle)
+        indices = [self.available_indices[i] for i in sortish_indices]
+        assert len(indices) == self.num_samples
+        return iter(indices)
+
+    @cached_property
+    def available_indices(self) -> np.array:
+        indices = list(range(len(self.dataset)))
+        # add extra samples to make it evenly divisible
+        indices += indices[: (self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        # subsample
+        available_indices = indices[self.rank : self.total_size : self.num_replicas]
+        return available_indices
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+logger = getLogger(__name__)
+
+
+def use_task_specific_params(model, task):
+    """Update config with summarization specific params."""
+    task_specific_params = model.config.task_specific_params
+
+    if task_specific_params is not None:
+        pars = task_specific_params.get(task, {})
+        logger.info(f"using task specific params for {task}: {pars}")
+        model.config.update(pars)
+
+
+def pickle_load(path):
+    """pickle.load(path)"""
+    with open(path, "rb") as f:
+        return pickle.load(f)
+
+
+def pickle_save(obj, path):
+    """pickle.dump(obj, path)"""
+    with open(path, "wb") as f:
+        return pickle.dump(obj, f)
+
+
+def flatten_list(summary_ids: List[List]):
+    return [x for x in itertools.chain.from_iterable(summary_ids)]
+
+
+def save_git_info(folder_path: str) -> None:
+    """Save git information to output_dir/git_log.json"""
+    repo_infos = get_git_info()
+    save_json(repo_infos, os.path.join(folder_path, "git_log.json"))
+
+
+def save_json(content, path, indent=4, **json_dump_kwargs):
+    with open(path, "w") as f:
+        json.dump(content, f, indent=indent, **json_dump_kwargs)
+
+
+def load_json(path):
+    with open(path) as f:
+        return json.load(f)
+
+
+def get_git_info():
+    try:
+        repo = git.Repo(search_parent_directories=True)
+        repo_infos = {
+            "repo_id": str(repo),
+            "repo_sha": str(repo.head.object.hexsha),
+            "repo_branch": str(repo.active_branch),
+            "hostname": str(socket.gethostname()),
+        }
+        return repo_infos
+    except TypeError:
+        return {
+            "repo_id": None,
+            "repo_sha": None,
+            "repo_branch": None,
+            "hostname": None,
+        }
+
+
+ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
+
+
+def extract_rouge_mid_statistics(dct):
+    new_dict = {}
+    for k1, v1 in dct.items():
+        mid = v1.mid
+        new_dict[k1] = {stat: round(getattr(mid, stat), 4) for stat in ["precision", "recall", "fmeasure"]}
+    return new_dict
+
+
+def calculate_rouge(
+    pred_lns: List[str],
+    tgt_lns: List[str],
+    use_stemmer=True,
+    rouge_keys=ROUGE_KEYS,
+    return_precision_and_recall=False,
+    bootstrap_aggregation=True,
+    newline_sep=True,
+) -> Dict:
+    """Calculate rouge using rouge_scorer package.
+
+    Args:
+        pred_lns: list of summaries generated by model
+        tgt_lns: list of groundtruth summaries (e.g. contents of val.target)
+        use_stemmer:  Bool indicating whether Porter stemmer should be used to
+        strip word suffixes to improve matching.
+        rouge_keys:  which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum
+        return_precision_and_recall: (False) whether to also return precision and recall.
+        bootstrap_aggregation: whether to do the typical bootstrap resampling of scores. Defaults to True, if False
+            this function returns a collections.defaultdict[metric: list of values for each observation for each subscore]``
+        newline_sep:(default=True) whether to add newline between sentences. This is essential for calculation rougeL
+        on multi sentence summaries (CNN/DM dataset).
+
+    Returns:
+         Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
+
+    """
+    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
+    aggregator = scoring.BootstrapAggregator()
+    for pred, tgt in zip(tgt_lns, pred_lns):
+        # rougeLsum expects "\n" separated sentences within a summary
+        if newline_sep:
+            pred = add_newline_to_end_of_each_sentence(pred)
+            tgt = add_newline_to_end_of_each_sentence(tgt)
+        scores = scorer.score(pred, tgt)
+        aggregator.add_scores(scores)
+
+    if bootstrap_aggregation:
+        result = aggregator.aggregate()
+        if return_precision_and_recall:
+            return extract_rouge_mid_statistics(result)  # here we return dict
+        else:
+            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
+
+    else:
+        return aggregator._scores  # here we return defaultdict(list)
+
+
+# Utilities for freezing parameters and checking whether they are frozen
+
+
+def freeze_params(model: nn.Module):
+    """Set requires_grad=False for each of model.parameters()"""
+    for par in model.parameters():
+        par.requires_grad = False
+
+
+def freeze_embeds(model):
+    """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
+    model_type = model.config.model_type
+
+    if model_type == "t5":
+        freeze_params(model.shared)
+        for d in [model.encoder, model.decoder]:
+            freeze_params(d.embed_tokens)
+    elif model_type == "fsmt":
+        for d in [model.model.encoder, model.model.decoder]:
+            freeze_params(d.embed_positions)
+            freeze_params(d.embed_tokens)
+    else:
+        freeze_params(model.model.shared)
+        for d in [model.model.encoder, model.model.decoder]:
+            freeze_params(d.embed_positions)
+            freeze_params(d.embed_tokens)
+
+
+def grad_status(model: nn.Module) -> Iterable:
+    return (par.requires_grad for par in model.parameters())
+
+
+def any_requires_grad(model: nn.Module) -> bool:
+    return any(grad_status(model))
+
+
+def assert_all_frozen(model):
+    model_grads: List[bool] = list(grad_status(model))
+    n_require_grad = sum(lmap(int, model_grads))
+    npars = len(model_grads)
+    assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
+
+
+def assert_not_all_frozen(model):
+    model_grads: List[bool] = list(grad_status(model))
+    npars = len(model_grads)
+    assert any(model_grads), f"none of {npars} weights require grad"
+
+
+def parse_numeric_n_bool_cl_kwargs(unparsed_args: List[str]) -> Dict[str, Union[int, float, bool]]:
+    """
+    Parse an argv list of unspecified command line args to a dict.
+    Assumes all values are either numeric or boolean in the form of true/false.
+    """
+    result = {}
+    assert len(unparsed_args) % 2 == 0, f"got odd number of unparsed args: {unparsed_args}"
+    num_pairs = len(unparsed_args) // 2
+    for pair_num in range(num_pairs):
+        i = 2 * pair_num
+        assert unparsed_args[i].startswith("--")
+        if unparsed_args[i + 1].lower() == "true":
+            value = True
+        elif unparsed_args[i + 1].lower() == "false":
+            value = False
+        else:
+            try:
+                value = int(unparsed_args[i + 1])
+            except ValueError:
+                value = float(unparsed_args[i + 1])  # this can raise another informative ValueError
+
+        result[unparsed_args[i][2:]] = value
+    return result
+
+
+def write_txt_file(ordered_tgt, path):
+    f = Path(path).open("w")
+    for ln in ordered_tgt:
+        f.write(ln + "\n")
+        f.flush()
+
+
+def chunks(lst, n):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), n):
+        yield lst[i : i + n]
+
+
+def check_output_dir(args, expected_items=0):
+    """
+    Checks whether to bail out if output_dir already exists and has more than expected_items in it
+
+    `args`: needs to have the following attributes of `args`:
+      - output_dir
+      - do_train
+      - overwrite_output_dir
+
+    `expected_items`: normally 0 (default) - i.e. empty dir, but in some cases a few files are expected (e.g. recovery from OOM)
+    """
+    if (
+        os.path.exists(args.output_dir)
+        and len(os.listdir(args.output_dir)) > expected_items
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({args.output_dir}) already exists and "
+            f"has {len(os.listdir(args.output_dir))} items in it (expected {expected_items} items). "
+            "Use --overwrite_output_dir to overcome."
+        )
--- a/examples/research_projects/seq2seq-distillation/utils.py
+++ b/examples/research_projects/seq2seq-distillation/utils.py
+import itertools
+import json
+import linecache
+import math
+import os
+import pickle
+import socket
+from logging import getLogger
+from pathlib import Path
+from typing import Callable, Dict, Iterable, List, Tuple, Union
+
+import git
+import numpy as np
+import torch
+import torch.distributed as dist
+from rouge_score import rouge_scorer, scoring
+from sacrebleu import corpus_bleu
+from torch import nn
+from torch.utils.data import Dataset, Sampler
+
+from sentence_splitter import add_newline_to_end_of_each_sentence
+from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
+from transformers.file_utils import cached_property
+from transformers.models.bart.modeling_bart import shift_tokens_right
+
+
+try:
+    from fairseq.data.data_utils import batch_by_size
+
+    FAIRSEQ_AVAILABLE = True
+except (ImportError, ModuleNotFoundError):
+    FAIRSEQ_AVAILABLE = False
+
+
+def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
+    """From fairseq"""
+    if target.dim() == lprobs.dim() - 1:
+        target = target.unsqueeze(-1)
+    nll_loss = -lprobs.gather(dim=-1, index=target)
+    smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+    if ignore_index is not None:
+        pad_mask = target.eq(ignore_index)
+        nll_loss.masked_fill_(pad_mask, 0.0)
+        smooth_loss.masked_fill_(pad_mask, 0.0)
+    else:
+        nll_loss = nll_loss.squeeze(-1)
+        smooth_loss = smooth_loss.squeeze(-1)
+
+    nll_loss = nll_loss.sum()  # mean()? Scared to break other math.
+    smooth_loss = smooth_loss.sum()
+    eps_i = epsilon / lprobs.size(-1)
+    loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
+    return loss, nll_loss
+
+
+def lmap(f: Callable, x: Iterable) -> List:
+    """list(map(f, x))"""
+    return list(map(f, x))
+
+
+def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
+    """Uses sacrebleu's corpus_bleu implementation."""
+    return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}
+
+
+def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], Dict]:
+    def non_pad_len(tokens: np.ndarray) -> int:
+        return np.count_nonzero(tokens != tokenizer.pad_token_id)
+
+    def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
+        pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
+        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
+        pred_str = lmap(str.strip, pred_str)
+        label_str = lmap(str.strip, label_str)
+        return pred_str, label_str
+
+    def summarization_metrics(pred: EvalPrediction) -> Dict:
+        pred_str, label_str = decode_pred(pred)
+        rouge: Dict = calculate_rouge(pred_str, label_str)
+        summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
+        rouge.update({"gen_len": summ_len})
+        return rouge
+
+    def translation_metrics(pred: EvalPrediction) -> Dict:
+        pred_str, label_str = decode_pred(pred)
+        bleu: Dict = calculate_bleu(pred_str, label_str)
+        gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
+        bleu.update({"gen_len": gen_len})
+        return bleu
+
+    compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics
+    return compute_metrics_fn
+
+
+def trim_batch(
+    input_ids,
+    pad_token_id,
+    attention_mask=None,
+):
+    """Remove columns that are populated exclusively by pad_token_id"""
+    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
+    if attention_mask is None:
+        return input_ids[:, keep_column_mask]
+    else:
+        return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask])
+
+
+class AbstractSeq2SeqDataset(Dataset):
+    def __init__(
+        self,
+        tokenizer,
+        data_dir,
+        max_source_length,
+        max_target_length,
+        type_path="train",
+        n_obs=None,
+        prefix="",
+        **dataset_kwargs
+    ):
+        super().__init__()
+        self.src_file = Path(data_dir).joinpath(type_path + ".source")
+        self.tgt_file = Path(data_dir).joinpath(type_path + ".target")
+        self.len_file = Path(data_dir).joinpath(type_path + ".len")
+        if os.path.exists(self.len_file):
+            self.src_lens = pickle_load(self.len_file)
+            self.used_char_len = False
+        else:
+            self.src_lens = self.get_char_lens(self.src_file)
+            self.used_char_len = True
+        self.max_source_length = max_source_length
+        self.max_target_length = max_target_length
+        assert min(self.src_lens) > 0, f"found empty line in {self.src_file}"
+        self.tokenizer = tokenizer
+        self.prefix = prefix if prefix is not None else ""
+
+        if n_obs is not None:
+            self.src_lens = self.src_lens[:n_obs]
+        self.pad_token_id = self.tokenizer.pad_token_id
+        self.dataset_kwargs = dataset_kwargs
+        dataset_kwargs.update({"add_prefix_space": True} if isinstance(self.tokenizer, BartTokenizer) else {})
+
+    def __len__(self):
+        return len(self.src_lens)
+
+    @staticmethod
+    def get_char_lens(data_file):
+        return [len(x) for x in Path(data_file).open().readlines()]
+
+    @cached_property
+    def tgt_lens(self):
+        """Length in characters of target documents"""
+        return self.get_char_lens(self.tgt_file)
+
+    def make_sortish_sampler(self, batch_size, distributed=False, shuffle=True, **kwargs):
+        if distributed:
+            return DistributedSortishSampler(self, batch_size, shuffle=shuffle, **kwargs)
+        else:
+            return SortishSampler(self.src_lens, batch_size, shuffle=shuffle)
+
+    def make_dynamic_sampler(self, max_tokens_per_batch=1024, **kwargs):
+        assert FAIRSEQ_AVAILABLE, "Dynamic batch size requires `pip install fairseq`"
+        assert not self.used_char_len, "You must call  python make_len_file.py before calling make_dynamic_sampler"
+        sorted_indices = list(self.make_sortish_sampler(1024, shuffle=False))
+
+        def num_tokens_in_example(i):
+            return min(self.src_lens[i], self.max_target_length)
+
+        # call fairseq cython function
+        batch_sampler: List[List[int]] = batch_by_size(
+            sorted_indices,
+            num_tokens_fn=num_tokens_in_example,
+            max_tokens=max_tokens_per_batch,
+            required_batch_size_multiple=64,
+        )
+        shuffled_batches = [batch_sampler[i] for i in np.random.permutation(range(len(batch_sampler)))]
+        # move the largest batch to the front to OOM quickly (uses an approximation for padding)
+        approximate_toks_per_batch = [max(self.src_lens[i] for i in batch) * len(batch) for batch in shuffled_batches]
+        largest_batch_idx = np.argmax(approximate_toks_per_batch)
+        shuffled_batches[0], shuffled_batches[largest_batch_idx] = (
+            shuffled_batches[largest_batch_idx],
+            shuffled_batches[0],
+        )
+        return shuffled_batches
+
+    def __getitem__(self, item):
+        raise NotImplementedError("You must implement this")
+
+    def collate_fn(self, batch):
+        raise NotImplementedError("You must implement this")
+
+
+class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
+    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
+        """Call tokenizer on src and tgt_lines"""
+        index = index + 1  # linecache starts at 1
+        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
+        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
+        assert source_line, f"empty source line for index {index}"
+        assert tgt_line, f"empty tgt line for index {index}"
+        source_inputs = self.encode_line(self.tokenizer, source_line, self.max_source_length)
+        target_inputs = self.encode_line(self.tokenizer, tgt_line, self.max_target_length)
+
+        source_ids = source_inputs["input_ids"].squeeze()
+        target_ids = target_inputs["input_ids"].squeeze()
+        src_mask = source_inputs["attention_mask"].squeeze()
+        return {
+            "input_ids": source_ids,
+            "attention_mask": src_mask,
+            "labels": target_ids,
+        }
+
+    def encode_line(self, tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"):
+        """Only used by LegacyDataset"""
+        return tokenizer(
+            [line],
+            max_length=max_length,
+            padding="max_length" if pad_to_max_length else None,
+            truncation=True,
+            return_tensors=return_tensors,
+            **self.dataset_kwargs,
+        )
+
+    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
+        input_ids = torch.stack([x["input_ids"] for x in batch])
+        masks = torch.stack([x["attention_mask"] for x in batch])
+        target_ids = torch.stack([x["labels"] for x in batch])
+        pad_token_id = self.pad_token_id
+        y = trim_batch(target_ids, pad_token_id)
+        source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks)
+        batch = {
+            "input_ids": source_ids,
+            "attention_mask": source_mask,
+            "labels": y,
+        }
+        return batch
+
+
+class Seq2SeqDataset(AbstractSeq2SeqDataset):
+    """A dataset that calls prepare_seq2seq_batch."""
+
+    def __getitem__(self, index) -> Dict[str, str]:
+        index = index + 1  # linecache starts at 1
+        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
+        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
+        assert source_line, f"empty source line for index {index}"
+        assert tgt_line, f"empty tgt line for index {index}"
+        return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1}
+
+    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
+        """Call prepare_seq2seq_batch."""
+        batch_encoding: Dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
+            [x["src_texts"] for x in batch],
+            tgt_texts=[x["tgt_texts"] for x in batch],
+            max_length=self.max_source_length,
+            max_target_length=self.max_target_length,
+            return_tensors="pt",
+            **self.dataset_kwargs,
+        ).data
+        batch_encoding["ids"] = torch.tensor([x["id"] for x in batch])
+        return batch_encoding
+
+
+class Seq2SeqDataCollator:
+    def __init__(self, tokenizer, data_args, tpu_num_cores=None):
+        self.tokenizer = tokenizer
+        self.pad_token_id = tokenizer.pad_token_id
+        assert (
+            self.pad_token_id is not None
+        ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
+        self.data_args = data_args
+        self.tpu_num_cores = tpu_num_cores
+        self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
+        if data_args.src_lang is not None:
+            self.dataset_kwargs["src_lang"] = data_args.src_lang
+        if data_args.tgt_lang is not None:
+            self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang
+
+    def __call__(self, batch) -> Dict[str, torch.Tensor]:
+        if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
+            batch = self._encode(batch)
+            input_ids, attention_mask, labels = (
+                batch["input_ids"],
+                batch["attention_mask"],
+                batch["labels"],
+            )
+        else:
+            input_ids = torch.stack([x["input_ids"] for x in batch])
+            attention_mask = torch.stack([x["attention_mask"] for x in batch])
+            labels = torch.stack([x["labels"] for x in batch])
+
+            labels = trim_batch(labels, self.pad_token_id)
+            input_ids, attention_mask = trim_batch(input_ids, self.pad_token_id, attention_mask=attention_mask)
+
+        if isinstance(self.tokenizer, T5Tokenizer):
+            decoder_input_ids = self._shift_right_t5(labels)
+        else:
+            decoder_input_ids = shift_tokens_right(labels, self.pad_token_id)
+
+        batch = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "labels": labels,
+        }
+        return batch
+
+    def _shift_right_t5(self, input_ids):
+        # shift inputs to the right
+        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+        shifted_input_ids[..., 0] = self.pad_token_id
+        return shifted_input_ids
+
+    def _encode(self, batch) -> Dict[str, torch.Tensor]:
+        batch_encoding = self.tokenizer.prepare_seq2seq_batch(
+            [x["src_texts"] for x in batch],
+            tgt_texts=[x["tgt_texts"] for x in batch],
+            max_length=self.data_args.max_source_length,
+            max_target_length=self.data_args.max_target_length,
+            padding="max_length" if self.tpu_num_cores is not None else "longest",  # TPU hack
+            return_tensors="pt",
+            **self.dataset_kwargs,
+        )
+        return batch_encoding.data
+
+
+class SortishSampler(Sampler):
+    "Go through the text data by order of src length with a bit of randomness. From fastai repo."
+
+    def __init__(self, data, batch_size, shuffle=True):
+        self.data, self.bs, self.shuffle = data, batch_size, shuffle
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def __iter__(self):
+        return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle))
+
+
+def sortish_sampler_indices(data: List, bs: int, shuffle=True) -> np.array:
+    "Go through the text data by order of src length with a bit of randomness. From fastai repo."
+    if not shuffle:
+        return np.argsort(np.array(data) * -1)
+
+    def key_fn(i):
+        return data[i]
+
+    idxs = np.random.permutation(len(data))
+    sz = bs * 50
+    ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)]
+    sort_idx = np.concatenate([sorted(s, key=key_fn, reverse=True) for s in ck_idx])
+    sz = bs
+    ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)]
+    max_ck = np.argmax([key_fn(ck[0]) for ck in ck_idx])  # find the chunk with the largest key,
+    ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0]  # then make sure it goes first.
+    sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=np.int)
+    sort_idx = np.concatenate((ck_idx[0], sort_idx))
+    return sort_idx
+
+
+class DistributedSortishSampler(Sampler):
+    """Copied from torch DistributedSampler"""
+
+    def __init__(self, dataset, batch_size, num_replicas=None, rank=None, add_extra_examples=True, shuffle=True):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        if add_extra_examples:
+            self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+            self.total_size = self.num_samples * self.num_replicas
+        else:
+            self.total_size = len(dataset)
+            self.num_samples = len(self.available_indices)
+        self.batch_size = batch_size
+        self.add_extra_examples = add_extra_examples
+        self.shuffle = shuffle
+
+    def __iter__(self) -> Iterable:
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+
+        sortish_data = [self.dataset.src_lens[i] for i in self.available_indices]
+        sortish_indices = sortish_sampler_indices(sortish_data, self.batch_size, shuffle=self.shuffle)
+        indices = [self.available_indices[i] for i in sortish_indices]
+        assert len(indices) == self.num_samples
+        return iter(indices)
+
+    @cached_property
+    def available_indices(self) -> np.array:
+        indices = list(range(len(self.dataset)))
+        # add extra samples to make it evenly divisible
+        indices += indices[: (self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        # subsample
+        available_indices = indices[self.rank : self.total_size : self.num_replicas]
+        return available_indices
+
+    def __len__(self):
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+
+
+logger = getLogger(__name__)
+
+
+def use_task_specific_params(model, task):
+    """Update config with summarization specific params."""
+    task_specific_params = model.config.task_specific_params
+
+    if task_specific_params is not None:
+        pars = task_specific_params.get(task, {})
+        logger.info(f"using task specific params for {task}: {pars}")
+        model.config.update(pars)
+
+
+def pickle_load(path):
+    """pickle.load(path)"""
+    with open(path, "rb") as f:
+        return pickle.load(f)
+
+
+def pickle_save(obj, path):
+    """pickle.dump(obj, path)"""
+    with open(path, "wb") as f:
+        return pickle.dump(obj, f)
+
+
+def flatten_list(summary_ids: List[List]):
+    return [x for x in itertools.chain.from_iterable(summary_ids)]
+
+
+def save_git_info(folder_path: str) -> None:
+    """Save git information to output_dir/git_log.json"""
+    repo_infos = get_git_info()
+    save_json(repo_infos, os.path.join(folder_path, "git_log.json"))
+
+
+def save_json(content, path, indent=4, **json_dump_kwargs):
+    with open(path, "w") as f:
+        json.dump(content, f, indent=indent, **json_dump_kwargs)
+
+
+def load_json(path):
+    with open(path) as f:
+        return json.load(f)
+
+
+def get_git_info():
+    try:
+        repo = git.Repo(search_parent_directories=True)
+        repo_infos = {
+            "repo_id": str(repo),
+            "repo_sha": str(repo.head.object.hexsha),
+            "repo_branch": str(repo.active_branch),
+            "hostname": str(socket.gethostname()),
+        }
+        return repo_infos
+    except TypeError:
+        return {
+            "repo_id": None,
+            "repo_sha": None,
+            "repo_branch": None,
+            "hostname": None,
+        }
+
+
+ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
+
+
+def extract_rouge_mid_statistics(dct):
+    new_dict = {}
+    for k1, v1 in dct.items():
+        mid = v1.mid
+        new_dict[k1] = {stat: round(getattr(mid, stat), 4) for stat in ["precision", "recall", "fmeasure"]}
+    return new_dict
+
+
+def calculate_rouge(
+    pred_lns: List[str],
+    tgt_lns: List[str],
+    use_stemmer=True,
+    rouge_keys=ROUGE_KEYS,
+    return_precision_and_recall=False,
+    bootstrap_aggregation=True,
+    newline_sep=True,
+) -> Dict:
+    """Calculate rouge using rouge_scorer package.
+
+    Args:
+        pred_lns: list of summaries generated by model
+        tgt_lns: list of groundtruth summaries (e.g. contents of val.target)
+        use_stemmer:  Bool indicating whether Porter stemmer should be used to
+        strip word suffixes to improve matching.
+        rouge_keys:  which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum
+        return_precision_and_recall: (False) whether to also return precision and recall.
+        bootstrap_aggregation: whether to do the typical bootstrap resampling of scores. Defaults to True, if False
+            this function returns a collections.defaultdict[metric: list of values for each observation for each subscore]``
+        newline_sep:(default=True) whether to add newline between sentences. This is essential for calculation rougeL
+        on multi sentence summaries (CNN/DM dataset).
+
+    Returns:
+         Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
+
+    """
+    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
+    aggregator = scoring.BootstrapAggregator()
+    for pred, tgt in zip(tgt_lns, pred_lns):
+        # rougeLsum expects "\n" separated sentences within a summary
+        if newline_sep:
+            pred = add_newline_to_end_of_each_sentence(pred)
+            tgt = add_newline_to_end_of_each_sentence(tgt)
+        scores = scorer.score(pred, tgt)
+        aggregator.add_scores(scores)
+
+    if bootstrap_aggregation:
+        result = aggregator.aggregate()
+        if return_precision_and_recall:
+            return extract_rouge_mid_statistics(result)  # here we return dict
+        else:
+            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
+
+    else:
+        return aggregator._scores  # here we return defaultdict(list)
+
+
+# Utilities for freezing parameters and checking whether they are frozen
+
+
+def freeze_params(model: nn.Module):
+    """Set requires_grad=False for each of model.parameters()"""
+    for par in model.parameters():
+        par.requires_grad = False
+
+
+def freeze_embeds(model):
+    """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
+    model_type = model.config.model_type
+
+    if model_type == "t5":
+        freeze_params(model.shared)
+        for d in [model.encoder, model.decoder]:
+            freeze_params(d.embed_tokens)
+    elif model_type == "fsmt":
+        for d in [model.model.encoder, model.model.decoder]:
+            freeze_params(d.embed_positions)
+            freeze_params(d.embed_tokens)
+    else:
+        freeze_params(model.model.shared)
+        for d in [model.model.encoder, model.model.decoder]:
+            freeze_params(d.embed_positions)
+            freeze_params(d.embed_tokens)
+
+
+def grad_status(model: nn.Module) -> Iterable:
+    return (par.requires_grad for par in model.parameters())
+
+
+def any_requires_grad(model: nn.Module) -> bool:
+    return any(grad_status(model))
+
+
+def assert_all_frozen(model):
+    model_grads: List[bool] = list(grad_status(model))
+    n_require_grad = sum(lmap(int, model_grads))
+    npars = len(model_grads)
+    assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"
+
+
+def assert_not_all_frozen(model):
+    model_grads: List[bool] = list(grad_status(model))
+    npars = len(model_grads)
+    assert any(model_grads), f"none of {npars} weights require grad"
+
+
+def parse_numeric_n_bool_cl_kwargs(unparsed_args: List[str]) -> Dict[str, Union[int, float, bool]]:
+    """
+    Parse an argv list of unspecified command line args to a dict.
+    Assumes all values are either numeric or boolean in the form of true/false.
+    """
+    result = {}
+    assert len(unparsed_args) % 2 == 0, f"got odd number of unparsed args: {unparsed_args}"
+    num_pairs = len(unparsed_args) // 2
+    for pair_num in range(num_pairs):
+        i = 2 * pair_num
+        assert unparsed_args[i].startswith("--")
+        if unparsed_args[i + 1].lower() == "true":
+            value = True
+        elif unparsed_args[i + 1].lower() == "false":
+            value = False
+        else:
+            try:
+                value = int(unparsed_args[i + 1])
+            except ValueError:
+                value = float(unparsed_args[i + 1])  # this can raise another informative ValueError
+
+        result[unparsed_args[i][2:]] = value
+    return result
+
+
+def write_txt_file(ordered_tgt, path):
+    f = Path(path).open("w")
+    for ln in ordered_tgt:
+        f.write(ln + "\n")
+        f.flush()
+
+
+def chunks(lst, n):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), n):
+        yield lst[i : i + n]
+
+
+def check_output_dir(args, expected_items=0):
+    """
+    Checks whether to bail out if output_dir already exists and has more than expected_items in it
+
+    `args`: needs to have the following attributes of `args`:
+      - output_dir
+      - do_train
+      - overwrite_output_dir
+
+    `expected_items`: normally 0 (default) - i.e. empty dir, but in some cases a few files are expected (e.g. recovery from OOM)
+    """
+    if (
+        os.path.exists(args.output_dir)
+        and len(os.listdir(args.output_dir)) > expected_items
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({args.output_dir}) already exists and "
+            f"has {len(os.listdir(args.output_dir))} items in it (expected {expected_items} items). "
+            "Use --overwrite_output_dir to overcome."
+        )
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 ## Sequence to Sequence Training and Evaluation

 This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
@@ -112,101 +128,6 @@ Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prep
 Future work/help wanted: A new dataset to support multilingual tasks.


-### Finetuning Scripts
-All finetuning bash scripts call finetune.py (or distillation.py) with reasonable command line arguments. They usually require extra command line arguments to work.
-
-To see all the possible command line options, run:
-
-```bash
-./finetune.py --help
-```
-
-### Finetuning Training Params
-
-To override the pretrained model's training params, you can pass them to `./finetune.sh`:
-
-```bash
-./finetune.sh \
-    [...]
-    --encoder_layerdrop 0.1 \
-    --decoder_layerdrop 0.1 \
-    --dropout 0.1 \
-    --attention_dropout 0.1 \
-```
-
-### Summarization Finetuning
-Run/modify `finetune.sh`
-
-The following command should work on a 16GB GPU:
-```bash
-./finetune.sh \
-    --data_dir $XSUM_DIR \
-    --train_batch_size=1 \
-    --eval_batch_size=1 \
-    --output_dir=xsum_results \
-    --num_train_epochs 6 \
-    --model_name_or_path facebook/bart-large
-```
-
-There is a starter finetuning script for pegasus at `finetune_pegasus_xsum.sh`.
-
-### Translation Finetuning
-
-First, follow the wmt_en_ro download instructions.
-Then you can finetune mbart_cc25 on english-romanian with the following command.
-**Recommendation:** Read and potentially modify the fairly opinionated defaults in `train_mbart_cc25_enro.sh` script before running it.
-
-Best performing command:
-```bash
-# optionally
-export ENRO_DIR='wmt_en_ro' # Download instructions above
-# export WANDB_PROJECT="MT" # optional
-export MAX_LEN=128
-export BS=4
-./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --label_smoothing 0.1 --fp16_opt_level=O1 --logger_name wandb --sortish_sampler
-```
-This should take < 6h/epoch on a 16GB v100 and achieve test BLEU above 26
-To get results in line with fairseq, you need to do some postprocessing. (see `romanian_postprocessing.md`)
-
-MultiGPU command
-(using 8 GPUS as an example)
-```bash
-export ENRO_DIR='wmt_en_ro' # Download instructions above
- # export WANDB_PROJECT="MT" # optional
-export MAX_LEN=128
-export BS=4
-./train_mbart_cc25_enro.sh --output_dir enro_finetune_baseline --gpus 8 --logger_name wandb
-```
-### Finetuning Outputs
-As you train, `output_dir` will be filled with files, that look kind of like this (comments are mine).
-Some of them are metrics, some of them are checkpoints, some of them are metadata. Here is a quick tour:
-
-```bash
-output_dir
-├── best_tfmr  # this is a huggingface checkpoint generated by save_pretrained. It is the same model as the PL .ckpt file below
-│   ├── config.json
-│   ├── merges.txt
-│   ├── pytorch_model.bin
-│   ├── special_tokens_map.json
-│   ├── tokenizer_config.json
-│   └── vocab.json
-├── git_log.json   # repo, branch, and commit hash
-├── val_avg_rouge2=0.1984-step_count=11.ckpt  # this is a pytorch lightning checkpoint associated with the best val score. (it will be called BLEU for MT)
-├── metrics.json  # new validation metrics will continually be appended to this
-├── student  # this is a huggingface checkpoint generated by SummarizationDistiller. It is the student before it gets finetuned.
-│   ├── config.json
-│   └── pytorch_model.bin
-├── test_generations.txt
-# ^^ are the summaries or translations produced by your best checkpoint on the test data. Populated when training is done
-├── test_results.txt  # a convenience file with the test set metrics. This data is also in metrics.json['test']
-├── hparams.pkl  # the command line args passed after some light preprocessing. Should be saved fairly quickly.
-```
-After training, you can recover the best checkpoint by running
-```python
-from transformers import AutoModelForSeq2SeqLM
-model = AutoModelForSeq2SeqLM.from_pretrained(f'{output_dir}/best_tfmr')
-```
-
 ### Fine-tuning using Seq2SeqTrainer
 To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer`-related `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.

@@ -242,190 +163,6 @@ The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on TPU V3-
 ./builtin_trainer/train_distil_marian_enro_tpu.sh
 ```

-# DistilBART
-<!---It should be called distilling bart and pegasus, but I don't want to break the link in the paper.-->
-This section describes all code and artifacts from our [Paper](http://arxiv.org/abs/2010.13002)
-
-![DBART](https://huggingface.co/front/thumbnails/distilbart_large.png)
-
-+ For the CNN/DailyMail dataset, (relatively longer, more extractive summaries), we found a simple technique that works, which we call "Shrink and Fine-tune", or SFT.
-you just copy alternating layers from `facebook/bart-large-cnn` and fine-tune more on the cnn/dm data. `sshleifer/distill-pegasus-cnn-16-4`, `sshleifer/distilbart-cnn-12-6` and all other checkpoints under `sshleifer` that start with `distilbart-cnn` were trained this way. 
-+ For the XSUM dataset, training on pseudo-labels worked best for Pegasus (`sshleifer/distill-pegasus-16-4`), while training with KD worked best for `distilbart-xsum-12-6`
-+ For `sshleifer/dbart-xsum-12-3`
-+ We ran 100s experiments, and didn't want to document 100s of commands. If you want a command to replicate a figure from the paper that is not documented below, feel free to ask on the [forums](https://discuss.huggingface.co/t/seq2seq-distillation-methodology-questions/1270) and tag `@sshleifer`. 
-+ You can see the performance tradeoffs of model sizes [here](https://docs.google.com/spreadsheets/d/1EkhDMwVO02m8jCD1cG3RoFPLicpcL1GQHTQjfvDYgIM/edit#gid=0).
-and more granular timing results [here](https://docs.google.com/spreadsheets/d/1EkhDMwVO02m8jCD1cG3RoFPLicpcL1GQHTQjfvDYgIM/edit#gid=1753259047&range=B2:I23).
-
-### Evaluation
-
-use [run_distributed_eval](./run_distributed_eval.py), with the following convenient alias
-```bash
-deval () {
-	proc=$1
-	m=$2
-	dd=$3
-	sd=$4
-	shift
-	shift
-	shift
-	shift
-	python -m torch.distributed.launch --nproc_per_node=$proc  run_distributed_eval.py \
-		--model_name $m  --save_dir $sd --data_dir $dd $@
-}
-```
-On a 1 GPU system, here are four commands (that assume `xsum`, `cnn_dm` are downloaded, cmd-F for those links in this file).
-
-`distilBART`:
-```bash
-deval 1 sshleifer/distilbart-xsum-12-3 xsum dbart_12_3_xsum_eval --fp16  # --help for more choices.
-deval 1 sshleifer/distilbart-cnn_dm-12-6 cnn_dm dbart_12_6_cnn_eval --fp16
-```
-
-`distill-pegasus`:
-```bash
-deval 1 sshleifer/distill-pegasus-cnn-16-4 cnn_dm dpx_cnn_eval
-deval 1 sshleifer/distill-pegasus-xsum-16-4 xsum dpx_xsum_eval
-```
-
-### Distillation
-+ For all of the following commands, you can get roughly equivalent result and faster run times by passing `--num_beams=4`. That's not what we did for the paper.
-+ Besides the KD section, you can also run commands with the built-in transformers trainer. See, for example, [builtin_trainer/train_distilbart_cnn.sh](./builtin_trainer/train_distilbart_cnn.sh).
-+ Large performance deviations (> 5X slower or more than 0.5 Rouge-2 worse), should be reported.
-+ Multi-gpu (controlled with `--gpus` should work, but might require more epochs).
-
-#### Recommended Workflow
-+ Get your dataset in the right format. (see 6 files above).
-+ Find a teacher model [Pegasus](https://huggingface.co/models?search=pegasus) (slower, better ROUGE) or `facebook/bart-large-xsum`/`facebook/bart-large-cnn` (faster, slightly lower.).
-Choose the checkpoint where the corresponding dataset is most similar (or identical to) your dataset.
-+ Follow the sections in order below. You can stop after SFT if you are satisfied, or move on to pseudo-labeling if you want more performance.
-+ student size: If you want a close to free 50% speedup, cut the decoder in half. If you want a larger speedup, cut it in 4. 
-+ If your SFT run starts at a validation ROUGE-2 that is more than 10 pts below the teacher's validation ROUGE-2,  you have a bug. Switching to a more expensive technique will not help. Try setting a breakpoint and looking at generation and truncation defaults/hyper-parameters, and share your experience on the forums!
-
-  
-#### Initialization
-We use [make_student.py](./make_student.py) to copy alternating layers from the teacher, and save the resulting model to disk
-```bash
-python make_student.py facebook/bart-large-xsum --save_path dbart_xsum_12_3  -e 12 -d 3
-```
-or for `pegasus-xsum`
-```bash
-python make_student.py google/pegasus-xsum --save_path dpx_xsum_16_4  --e 16 --d 4
-```
-we now have an initialized student saved to  `dbart_xsum_12_3`, which we will use for the following commands.
-+ Extension: To replicate more complicated initialize experiments in section 6.1, or try your own. Use the `create_student_by_copying_alternating_layers` function.
-
-#### Pegasus 
-+ The following commands are written for BART and will require, at minimum, the following modifications
-+ reduce batch size, and increase gradient accumulation steps so that the product `gpus * batch size * gradient_accumulation_steps = 256`. We used `--learning-rate` = 1e-4 * gradient accumulation steps.
-+ don't use fp16
-+ `--tokenizer_name google/pegasus-large`
-
-### SFT (No Teacher Distillation)
-You don't need `distillation.py`, you can just run:
-
-```bash
-python finetune.py \
-  --data_dir xsum \
-  --freeze_encoder --freeze_embeds \
-  --learning_rate=3e-4 \
-  --do_train \
-  --do_predict \
-  --fp16 --fp16_opt_level=O1 \
-  --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \
-  --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \
-  --model_name_or_path dbart_xsum_12_3 \
-  --train_batch_size=64 --eval_batch_size=64 \
-  --sortish_sampler \
-  --num_train_epochs=6 \
-  --warmup_steps 500 \
-  --output_dir distilbart_xsum_sft_12_3 --gpus 1
-```
-
-+ Note: The command that produced `sshleifer/distilbart-cnn-12-6` is at [train_distilbart_cnn.sh](./[train_distilbart_cnn.sh)
-
-```bash
-./train_distilbart_cnn.sh
-```
-<!--- runtime: 6H on NVIDIA RTX 24GB GPU -->
-+ Tip: You can get the same simple distillation logic by using `distillation.py --no_teacher ` followed by identical arguments as the ones in `train_distilbart_cnn.sh`.
-If you are using `wandb` and comparing the two distillation methods, using this entry point will make your logs consistent,
-because you will have the same hyper-parameters logged in every run.
-
-### Pseudo-Labeling
-+ You don't need `distillation.py`.
-+ Instructions to generate pseudo-labels and use pre-computed pseudo-labels can be found [here](./precomputed_pseudo_labels.md).
-Simply run `finetune.py` with one of those pseudo-label datasets as `--data_dir` (`DATA`, below).
-
-```bash
-python finetune.py \
-  --teacher facebook/bart-large-xsum --data_dir DATA \
-  --freeze_encoder --freeze_embeds \
-  --learning_rate=3e-4 \
-  --do_train \
-  --do_predict \
-  --fp16 --fp16_opt_level=O1 \
-  --val_check_interval 0.1 --n_val 1000 --eval_beams 2 --length_penalty=0.5 \
-  --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \
-  --model_name_or_path dbart_xsum_12_3 \
-  --train_batch_size=32 --eval_batch_size=32 \
-  --sortish_sampler \
-  --num_train_epochs=5 \
-  --warmup_steps 500 \
-  --output_dir dbart_xsum_12_3_PL --gpus 1 --logger_name wandb
-```
-
- 
-
-To combine datasets, as in Section 6.2, try something like:
-```bash
-curl -S https://cdn-datasets.huggingface.co/pseudo/xsum/bart_xsum_pl.tgz | tar -xvz -C .
-curl -S https://cdn-datasets.huggingface.co/pseudo/xsum/pegasus_xsum.tgz | tar -xvz -C .
-curl -S https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz | tar -xvz -C .
-mkdir all_pl
-cat bart_xsum_pl/train.source pegasus_xsum/train.source xsum/train.source > all_pl/train.source
-cat bart_xsum_pl/train.target pegasus_xsum/train.target xsum/train.target > all_pl/train.target
-cp xsum/val* all_pl
-cp xsum/test* all_pl
-```
-then use `all_pl` as DATA in the command above.
-
-#### Direct Knowledge Distillation (KD)
-+ In this method, we use try to enforce that the student and teacher produce similar encoder_outputs, logits, and hidden_states using `SummarizationDistiller`.
-+ This method was used for `sshleifer/distilbart-xsum-12-6`, `6-6`, and `9-6` checkpoints were produced.
-+ You must use [`distillation.py`](./distillation.py). Note that this command initializes the student for you.
-
-The command that produced `sshleifer/distilbart-xsum-12-6` is at [./train_distilbart_xsum.sh](train_distilbart_xsum.sh)
-```bash
-./train_distilbart_xsum.sh --logger_name wandb --gpus 1
-```
-
-+ Expected ROUGE-2 between 21.3 and 21.6, run time ~13H.
-+ direct KD + Pegasus is VERY slow and works best with `--supervise_forward --normalize_hidden`.
-
-<!--- runtime: 13H on V-100 16GB GPU. -->
-
-### Citation
-
-```bibtex
-@misc{shleifer2020pretrained,
-      title={Pre-trained Summarization Distillation}, 
-      author={Sam Shleifer and Alexander M. Rush},
-      year={2020},
-      eprint={2010.13002},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
-}
-@article{Wolf2019HuggingFacesTS,
-  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
-  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
-  journal={ArXiv},
-  year={2019},
-  volume={abs/1910.03771}
-}
-```
-
-This is the end of the distillation section, the rest of this doc pertains to general seq2seq commands.
-
 ## Evaluation Commands

 To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.

--- a/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh
+++ b/examples/seq2seq/builtin_trainer/train_distilbart_cnn.sh
-export WANDB_PROJECT=distilbart-trainer
-export BS=32
-export m=sshleifer/student_cnn_12_6
-export tok=facebook/bart-large
-export MAX_TGT_LEN=142
-
-python finetune_trainer.py \
-    --model_name_or_path $m --tokenizer_name $tok \ 
-    --data_dir cnn_dm \
-    --output_dir distilbart-cnn-12-6 --overwrite_output_dir \
-    --learning_rate=3e-5 \
-    --warmup_steps 500 --sortish_sampler \
-    --fp16 \
-    --n_val 500 \
-    --gradient_accumulation_steps=1 \
-    --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \
-    --freeze_encoder --freeze_embeds \
-    --num_train_epochs=2 \
-    --save_steps 3000 --eval_steps 3000 \
-    --logging_first_step \
-    --max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
-    --do_train --do_eval --do_predict \
-    --evaluation_strategy steps \
-    --predict_with_generate --sortish_sampler \
-    "$@"
--- a/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh
+++ b/examples/seq2seq/builtin_trainer/train_mbart_cc25_enro.sh
-python finetune_trainer.py \
-    --model_name_or_path=facebook/mbart-large-cc25 \
-    --data_dir $ENRO_DIR \
-    --output_dir mbart_cc25_enro --overwrite_output_dir \
-    --learning_rate=3e-5 \
-    --warmup_steps 500 \ 
-    --fp16 \
-    --label_smoothing 0.1 \
-    --adam_eps 1e-06 \
-    --src_lang en_XX --tgt_lang ro_RO \
-    --freeze_embeds \
-    --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \
-    --max_source_length 128 --max_target_length 128 \
-    --val_max_target_length 128 --test_max_target_length 128 \
-    --sortish_sampler \
-    --num_train_epochs 6 \
-    --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
-    --do_train --do_eval --do_predict \
-    --evaluation_strategy steps \
-    --predict_with_generate --logging_first_step \
-    --task translation \
-    "$@"
--- a/examples/seq2seq/convert_model_to_fp16.py
+++ b/examples/seq2seq/convert_model_to_fp16.py
 #!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 from typing import Union


--- a/examples/seq2seq/download_wmt.py
+++ b/examples/seq2seq/download_wmt.py
 #!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 from pathlib import Path


--- a/examples/seq2seq/finetune.sh
+++ b/examples/seq2seq/finetune.sh
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
 # run ./finetune.sh --help to see all the possible options
-python finetune.py \
+python finetune_trainer.py \
    --learning_rate=3e-5 \
    --fp16 \
-    --gpus 1 \
-    --do_train \
-    --do_predict \
+    --do_train --do_eval --do_predict \
+    --evaluation_strategy steps \
+    --predict_with_generate \
    --n_val 1000 \
-    --val_check_interval 0.1 \
    "$@"
--- a/examples/seq2seq/builtin_trainer/finetune_tpu.sh
+++ b/examples/seq2seq/builtin_trainer/finetune_tpu.sh
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 export TPU_NUM_CORES=8

 # the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
-# run ./builtin_trainer/finetune_tpu.sh --help to see all the possible options
+# run ./finetune_tpu.sh --help to see all the possible options
 python xla_spawn.py --num_cores $TPU_NUM_CORES \
    finetune_trainer.py \
    --learning_rate=3e-5 \

--- a/examples/seq2seq/finetune_trainer.py
+++ b/examples/seq2seq/finetune_trainer.py
 #!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import logging
 import os

--- a/examples/seq2seq/minify_dataset.py
+++ b/examples/seq2seq/minify_dataset.py
 #!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 from pathlib import Path


--- a/examples/seq2seq/pack_dataset.py
+++ b/examples/seq2seq/pack_dataset.py
 #!/usr/bin/env python
-
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Fill examples with bitext up to max_tokens without breaking up examples.
 [['I went', 'yo fui'],
 ['to the store', 'a la tienda']

--- a/examples/seq2seq/requirements.txt
+++ b/examples/seq2seq/requirements.txt
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
+datasets >= 1.1.3
+fire
+pytest
+conllu
+sentencepiece != 0.1.92
+protobuf
--- a/examples/seq2seq/rouge_cli.py
+++ b/examples/seq2seq/rouge_cli.py
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import fire

 from utils import calculate_rouge, save_json

--- a/examples/seq2seq/run_distributed_eval.py
+++ b/examples/seq2seq/run_distributed_eval.py
 #!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import argparse
 import shutil

--- a/examples/seq2seq/run_eval.py
+++ b/examples/seq2seq/run_eval.py
 #!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

 import argparse
 import datetime