Unverified Commit 783d7d26 authored by Sylvain Gugger, committed by GitHub

Reorganize examples (#9010)



* Reorganize example folder

* Continue reorganization

* Change requirements for tests

* Final cleanup

* Finish regroup with tests all passing

* Copyright

* Requirements and readme

* Make a full link for the documentation

* Address review comments

* Apply suggestions from code review
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Add symlink

* Reorg again

* Apply suggestions from code review
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Adapt title

* Update to new structure

* Remove test

* Update READMEs
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
parent 86896de0
import argparse
import logging
import os
import sys
import tempfile
from pathlib import Path
import pytest
import pytorch_lightning as pl
import torch
import lightning_base
from convert_pl_checkpoint_to_hf import convert_pl_to_hf
from distillation import distill_main
from finetune import SummarizationModule, main
from parameterized import parameterized
from run_eval import generate_summaries_or_translations
from transformers import AutoConfig, AutoModelForSeq2SeqLM
from transformers.hf_api import HfApi
from transformers.testing_utils import CaptureStderr, CaptureStdout, TestCasePlus, require_torch_gpu, slow
from utils import label_smoothed_nll_loss, lmap, load_json
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
CUDA_AVAILABLE = torch.cuda.is_available()
CHEAP_ARGS = {
"max_tokens_per_batch": None,
"supervise_forward": True,
"normalize_hidden": True,
"label_smoothing": 0.2,
"eval_max_gen_length": None,
"eval_beams": 1,
"val_metric": "loss",
"save_top_k": 1,
"adafactor": True,
"early_stopping_patience": 2,
"logger_name": "default",
"length_penalty": 0.5,
"cache_dir": "",
"task": "summarization",
"num_workers": 2,
"alpha_hid": 0,
"freeze_embeds": True,
"enc_only": False,
"tgt_suffix": "",
"resume_from_checkpoint": None,
"sortish_sampler": True,
"student_decoder_layers": 1,
"val_check_interval": 1.0,
"output_dir": "",
"fp16": False, # TODO(SS): set this to CUDA_AVAILABLE if ci installs apex or start using native amp
"no_teacher": False,
"fp16_opt_level": "O1",
"gpus": 1 if CUDA_AVAILABLE else 0,
"n_tpu_cores": 0,
"max_grad_norm": 1.0,
"do_train": True,
"do_predict": True,
"accumulate_grad_batches": 1,
"server_ip": "",
"server_port": "",
"seed": 42,
"model_name_or_path": "sshleifer/bart-tiny-random",
"config_name": "",
"tokenizer_name": "facebook/bart-large",
"do_lower_case": False,
"learning_rate": 0.3,
"lr_scheduler": "linear",
"weight_decay": 0.0,
"adam_epsilon": 1e-08,
"warmup_steps": 0,
"max_epochs": 1,
"train_batch_size": 2,
"eval_batch_size": 2,
"max_source_length": 12,
"max_target_length": 12,
"val_max_target_length": 12,
"test_max_target_length": 12,
"fast_dev_run": False,
"no_cache": False,
"n_train": -1,
"n_val": -1,
"n_test": -1,
"student_encoder_layers": 1,
"freeze_encoder": False,
"auto_scale_batch_size": False,
"overwrite_output_dir": False,
"student": None,
}
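# A minimal sketch of how these cheap args are consumed by the tests below
# (mirroring _test_distiller_cli; the data_dir/output_dir values are placeholders):
#     args_d = CHEAP_ARGS.copy()
#     args_d.update(data_dir="path/to/tiny/data", output_dir="path/to/output")
#     model = distill_main(argparse.Namespace(**args_d))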
def _dump_articles(path: Path, articles: list):
content = "\n".join(articles)
Path(path).write_text(content)
ARTICLES = [" Sam ate lunch today.", "Sams lunch ingredients."]
SUMMARIES = ["A very interesting story about what I ate for lunch.", "Avocado, celery, turkey, coffee"]
T5_TINY = "patrickvonplaten/t5-tiny-random"
T5_TINIER = "sshleifer/t5-tinier-random"
BART_TINY = "sshleifer/bart-tiny-random"
MBART_TINY = "sshleifer/tiny-mbart"
MARIAN_TINY = "sshleifer/tiny-marian-en-de"
FSMT_TINY = "stas/tiny-wmt19-en-de"
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
logging.disable(logging.CRITICAL) # remove noisy download output from tracebacks
def make_test_data_dir(tmp_dir):
for split in ["train", "val", "test"]:
_dump_articles(os.path.join(tmp_dir, f"{split}.source"), ARTICLES)
_dump_articles(os.path.join(tmp_dir, f"{split}.target"), SUMMARIES)
return tmp_dir
class TestSummarizationDistiller(TestCasePlus):
@classmethod
def setUpClass(cls):
logging.disable(logging.CRITICAL) # remove noisy download output from tracebacks
return cls
@slow
@require_torch_gpu
def test_hub_configs(self):
"""I put require_torch_gpu cause I only want this to run with self-scheduled."""
model_list = HfApi().model_list()
org = "sshleifer"
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
allowed_to_be_broken = ["sshleifer/blenderbot-3B", "sshleifer/blenderbot-90M"]
failures = []
for m in model_ids:
if m in allowed_to_be_broken:
continue
try:
AutoConfig.from_pretrained(m)
except Exception:
failures.append(m)
assert not failures, f"The following models could not be loaded through AutoConfig: {failures}"
def test_distill_no_teacher(self):
updates = dict(student_encoder_layers=2, student_decoder_layers=1, no_teacher=True)
self._test_distiller_cli(updates)
def test_distill_checkpointing_with_teacher(self):
updates = dict(
student_encoder_layers=2,
student_decoder_layers=1,
max_epochs=4,
val_check_interval=0.25,
alpha_hid=2.0,
model_name_or_path="IGNORE_THIS_IT_DOESNT_GET_USED",
)
model = self._test_distiller_cli(updates, check_contents=False)
ckpts = list(Path(model.output_dir).glob("*.ckpt"))
self.assertEqual(1, len(ckpts))
transformer_ckpts = list(Path(model.output_dir).glob("**/*.bin"))
self.assertEqual(len(transformer_ckpts), 2)
examples = lmap(str.strip, Path(model.hparams.data_dir).joinpath("test.source").open().readlines())
out_path = tempfile.mktemp() # XXX: not being cleaned up
generate_summaries_or_translations(examples, out_path, str(model.output_dir / "best_tfmr"))
self.assertTrue(Path(out_path).exists())
out_path_new = self.get_auto_remove_tmp_dir()
convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new)
assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin"))
def test_loss_fn(self):
model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY)
input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"]
target_ids = torch.tensor([[0, 4, 8, 2], [0, 8, 2, 1]], dtype=torch.long, device=model.device)
decoder_input_ids = target_ids[:, :-1].contiguous()  # shift the targets right: standard teacher forcing for seq2seq
lm_labels = target_ids[:, 1:].clone()  # clone so the labels don't share memory with target_ids
model_computed_loss = model(
input_ids, attention_mask=mask, decoder_input_ids=decoder_input_ids, labels=lm_labels, use_cache=False
).loss
logits = model(input_ids, attention_mask=mask, decoder_input_ids=decoder_input_ids, use_cache=False).logits
lprobs = torch.nn.functional.log_softmax(logits, dim=-1)
smoothed_loss, nll_loss = label_smoothed_nll_loss(
lprobs, lm_labels, 0.1, ignore_index=model.config.pad_token_id
)
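# Label smoothing conventionally mixes the gold-token NLL with a uniform term,
# roughly: loss = (1 - eps) * nll + (eps / vocab_size) * sum(-lprobs).
# This is only a sketch of the usual formula; see utils.label_smoothed_nll_loss
# for the exact implementation used here.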
with self.assertRaises(AssertionError):
# TODO: understand why this breaks
self.assertEqual(nll_loss, model_computed_loss)
def test_distill_mbart(self):
updates = dict(
student_encoder_layers=2,
student_decoder_layers=1,
num_train_epochs=4,
val_check_interval=0.25,
alpha_hid=2.0,
task="translation",
model_name_or_path="IGNORE_THIS_IT_DOESNT_GET_USED",
tokenizer_name=MBART_TINY,
teacher=MBART_TINY,
src_lang="en_XX",
tgt_lang="ro_RO",
)
model = self._test_distiller_cli(updates, check_contents=False)
assert model.model.config.model_type == "mbart"
ckpts = list(Path(model.output_dir).glob("*.ckpt"))
self.assertEqual(1, len(ckpts))
transformer_ckpts = list(Path(model.output_dir).glob("**/*.bin"))
all_files = list(Path(model.output_dir).glob("best_tfmr/*"))
assert len(all_files) > 2
self.assertEqual(len(transformer_ckpts), 2)
def test_distill_t5(self):
updates = dict(
student_encoder_layers=1,
student_decoder_layers=1,
alpha_hid=2.0,
teacher=T5_TINY,
model_name_or_path=T5_TINY,
tokenizer_name=T5_TINY,
)
self._test_distiller_cli(updates)
def test_distill_different_base_models(self):
updates = dict(
teacher=T5_TINY,
student=T5_TINIER,
model_name_or_path=T5_TINIER,
tokenizer_name=T5_TINIER,
)
self._test_distiller_cli(updates)
def _test_distiller_cli(self, updates, check_contents=True):
default_updates = dict(
label_smoothing=0.0,
early_stopping_patience=-1,
train_batch_size=1,
eval_batch_size=2,
max_epochs=2,
alpha_mlm=0.2,
alpha_ce=0.8,
do_predict=True,
model_name_or_path="sshleifer/tinier_bart",
teacher=CHEAP_ARGS["model_name_or_path"],
val_check_interval=0.5,
)
default_updates.update(updates)
args_d: dict = CHEAP_ARGS.copy()
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
output_dir = self.get_auto_remove_tmp_dir()
args_d.update(data_dir=tmp_dir, output_dir=output_dir, **default_updates)
model = distill_main(argparse.Namespace(**args_d))
if not check_contents:
return model
contents = os.listdir(output_dir)
contents = {os.path.basename(p) for p in contents}
ckpt_files = [p for p in contents if p.endswith("ckpt")]
assert len(ckpt_files) > 0
self.assertIn("test_generations.txt", contents)
self.assertIn("test_results.txt", contents)
metrics = load_json(model.metrics_save_path)
last_step_stats = metrics["val"][-1]
self.assertGreaterEqual(last_step_stats["val_avg_gen_time"], 0.01)
self.assertGreaterEqual(1.0, last_step_stats["val_avg_gen_time"])
self.assertIsInstance(last_step_stats[f"val_avg_{model.val_metric}"], float)
desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) + 1)
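# e.g. with the defaults above (max_epochs=2, val_check_interval=0.5):
# int(2 * (1 / 0.5) + 1) = 5 expected validation logs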
self.assertEqual(len(metrics["val"]), desired_n_evals)
self.assertEqual(len(metrics["test"]), 1)
return model
class TestTheRest(TestCasePlus):
@parameterized.expand(
[T5_TINY, BART_TINY, MBART_TINY, MARIAN_TINY, FSMT_TINY],
)
def test_finetune(self, model):
args_d: dict = CHEAP_ARGS.copy()
task = "translation" if model in [MBART_TINY, MARIAN_TINY, FSMT_TINY] else "summarization"
args_d["label_smoothing"] = 0.1 if task == "translation" else 0
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
output_dir = self.get_auto_remove_tmp_dir()
args_d.update(
data_dir=tmp_dir,
model_name_or_path=model,
tokenizer_name=None,
train_batch_size=2,
eval_batch_size=2,
output_dir=output_dir,
do_predict=True,
task=task,
src_lang="en_XX",
tgt_lang="ro_RO",
freeze_encoder=True,
freeze_embeds=True,
)
assert "n_train" in args_d
args = argparse.Namespace(**args_d)
module = main(args)
input_embeds = module.model.get_input_embeddings()
assert not input_embeds.weight.requires_grad
if model == T5_TINY:
lm_head = module.model.lm_head
assert not lm_head.weight.requires_grad
assert (lm_head.weight == input_embeds.weight).all().item()
elif model == FSMT_TINY:
fsmt = module.model.model
embed_pos = fsmt.decoder.embed_positions
assert not embed_pos.weight.requires_grad
assert not fsmt.decoder.embed_tokens.weight.requires_grad
# check that embeds are not the same
assert fsmt.decoder.embed_tokens != fsmt.encoder.embed_tokens
else:
bart = module.model.model
embed_pos = bart.decoder.embed_positions
assert not embed_pos.weight.requires_grad
assert not bart.shared.weight.requires_grad
# check that embeds are the same
assert bart.decoder.embed_tokens == bart.encoder.embed_tokens
assert bart.decoder.embed_tokens == bart.shared
example_batch = load_json(module.output_dir / "text_batch.json")
assert isinstance(example_batch, dict)
assert len(example_batch) >= 4
def test_finetune_extra_model_args(self):
args_d: dict = CHEAP_ARGS.copy()
task = "summarization"
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
args_d.update(
data_dir=tmp_dir,
tokenizer_name=None,
train_batch_size=2,
eval_batch_size=2,
do_predict=False,
task=task,
src_lang="en_XX",
tgt_lang="ro_RO",
freeze_encoder=True,
freeze_embeds=True,
)
# test models whose config includes the extra_model_args
model = BART_TINY
output_dir = self.get_auto_remove_tmp_dir()
args_d1 = args_d.copy()
args_d1.update(
model_name_or_path=model,
output_dir=output_dir,
)
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
for p in extra_model_params:
args_d1[p] = 0.5
args = argparse.Namespace(**args_d1)
model = main(args)
for p in extra_model_params:
assert getattr(model.config, p) == 0.5, f"failed to override the model config for param {p}"
# test models whose config doesn't include the extra_model_args
model = T5_TINY
output_dir = self.get_auto_remove_tmp_dir()
args_d2 = args_d.copy()
args_d2.update(
model_name_or_path=model,
output_dir=output_dir,
)
unsupported_param = "encoder_layerdrop"
args_d2[unsupported_param] = 0.5
args = argparse.Namespace(**args_d2)
with pytest.raises(Exception) as excinfo:
model = main(args)
assert str(excinfo.value) == f"model config doesn't have a `{unsupported_param}` attribute"
def test_finetune_lr_schedulers(self):
args_d: dict = CHEAP_ARGS.copy()
task = "summarization"
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
model = BART_TINY
output_dir = self.get_auto_remove_tmp_dir()
args_d.update(
data_dir=tmp_dir,
model_name_or_path=model,
output_dir=output_dir,
tokenizer_name=None,
train_batch_size=2,
eval_batch_size=2,
do_predict=False,
task=task,
src_lang="en_XX",
tgt_lang="ro_RO",
freeze_encoder=True,
freeze_embeds=True,
)
# emulate finetune.py
parser = argparse.ArgumentParser()
parser = pl.Trainer.add_argparse_args(parser)
parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())
args = {"--help": True}
# --help test
with pytest.raises(SystemExit) as excinfo:
with CaptureStdout() as cs:
args = parser.parse_args(args)
assert False, "--help is expected to sys.exit"
assert excinfo.type == SystemExit
expected = lightning_base.arg_to_scheduler_metavar
assert expected in cs.out, "--help is expected to list the supported schedulers"
# --lr_scheduler=non_existing_scheduler test
unsupported_param = "non_existing_scheduler"
args = {f"--lr_scheduler={unsupported_param}"}
with pytest.raises(SystemExit) as excinfo:
with CaptureStderr() as cs:
args = parser.parse_args(args)
assert False, "invalid argument is expected to sys.exit"
assert excinfo.type == SystemExit
expected = f"invalid choice: '{unsupported_param}'"
assert expected in cs.err, f"should have bailed on invalid choice of scheduler {unsupported_param}"
# --lr_scheduler=existing_scheduler test
supported_param = "cosine"
args_d1 = args_d.copy()
args_d1["lr_scheduler"] = supported_param
args = argparse.Namespace(**args_d1)
model = main(args)
assert (
getattr(model.hparams, "lr_scheduler") == supported_param
), f"lr_scheduler={supported_param} shouldn't fail"
# Multi-GPU tests live in a separate module because their complexity could impact other tests, and a separate module aids debugging.
import os
import sys
from pathlib import Path
import torch
from transformers.testing_utils import TestCasePlus, execute_subprocess_async, require_torch_multi_gpu
from utils import load_json
CUDA_AVAILABLE = torch.cuda.is_available()
ARTICLES = [" Sam ate lunch today.", "Sams lunch ingredients."]
SUMMARIES = ["A very interesting story about what I ate for lunch.", "Avocado, celery, turkey, coffee"]
CHEAP_ARGS = {
"max_tokens_per_batch": None,
"supervise_forward": True,
"normalize_hidden": True,
"label_smoothing": 0.2,
"eval_max_gen_length": None,
"eval_beams": 1,
"val_metric": "loss",
"save_top_k": 1,
"adafactor": True,
"early_stopping_patience": 2,
"logger_name": "default",
"length_penalty": 0.5,
"cache_dir": "",
"task": "summarization",
"num_workers": 2,
"alpha_hid": 0,
"freeze_embeds": True,
"enc_only": False,
"tgt_suffix": "",
"resume_from_checkpoint": None,
"sortish_sampler": True,
"student_decoder_layers": 1,
"val_check_interval": 1.0,
"output_dir": "",
"fp16": False, # TODO(SS): set this to CUDA_AVAILABLE if ci installs apex or start using native amp
"no_teacher": False,
"fp16_opt_level": "O1",
"gpus": 1 if CUDA_AVAILABLE else 0,
"n_tpu_cores": 0,
"max_grad_norm": 1.0,
"do_train": True,
"do_predict": True,
"accumulate_grad_batches": 1,
"server_ip": "",
"server_port": "",
"seed": 42,
"model_name_or_path": "sshleifer/bart-tiny-random",
"config_name": "",
"tokenizer_name": "facebook/bart-large",
"do_lower_case": False,
"learning_rate": 0.3,
"lr_scheduler": "linear",
"weight_decay": 0.0,
"adam_epsilon": 1e-08,
"warmup_steps": 0,
"max_epochs": 1,
"train_batch_size": 2,
"eval_batch_size": 2,
"max_source_length": 12,
"max_target_length": 12,
"val_max_target_length": 12,
"test_max_target_length": 12,
"fast_dev_run": False,
"no_cache": False,
"n_train": -1,
"n_val": -1,
"n_test": -1,
"student_encoder_layers": 1,
"freeze_encoder": False,
"auto_scale_batch_size": False,
"overwrite_output_dir": False,
"student": None,
}
def _dump_articles(path: Path, articles: list):
content = "\n".join(articles)
Path(path).write_text(content)
def make_test_data_dir(tmp_dir):
for split in ["train", "val", "test"]:
_dump_articles(os.path.join(tmp_dir, f"{split}.source"), ARTICLES)
_dump_articles(os.path.join(tmp_dir, f"{split}.target"), SUMMARIES)
return tmp_dir
class TestSummarizationDistillerMultiGPU(TestCasePlus):
@classmethod
def setUpClass(cls):
return cls
@require_torch_multi_gpu
def test_multi_gpu(self):
updates = dict(
no_teacher=True,
freeze_encoder=True,
gpus=2,
overwrite_output_dir=True,
sortish_sampler=True,
)
self._test_distiller_cli_fork(updates, check_contents=False)
def _test_distiller_cli_fork(self, updates, check_contents=True):
default_updates = dict(
label_smoothing=0.0,
early_stopping_patience=-1,
train_batch_size=1,
eval_batch_size=2,
max_epochs=2,
alpha_mlm=0.2,
alpha_ce=0.8,
do_predict=True,
model_name_or_path="sshleifer/tinier_bart",
teacher=CHEAP_ARGS["model_name_or_path"],
val_check_interval=0.5,
)
default_updates.update(updates)
args_d: dict = CHEAP_ARGS.copy()
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
output_dir = self.get_auto_remove_tmp_dir()
args_d.update(data_dir=tmp_dir, output_dir=output_dir, **default_updates)
def convert(k, v):
if k in ["tgt_suffix", "server_ip", "server_port", "out", "n_tpu_cores"]:
return ""
if v is False or v is None:
return ""
if v is True: # or len(str(v))==0:
return f"--{k}"
return f"--{k}={v}"
cli_args = [x for x in (convert(k, v) for k, v in args_d.items()) if len(x)]
cmd = [sys.executable, f"{self.test_file_dir}/distillation.py"] + cli_args
execute_subprocess_async(cmd, env=self.get_env())
contents = os.listdir(output_dir)
contents = {os.path.basename(p) for p in contents}
ckpt_files = [p for p in contents if p.endswith("ckpt")]
assert len(ckpt_files) > 0
self.assertIn("test_generations.txt", contents)
self.assertIn("test_results.txt", contents)
# get the following from the module (we don't have access to `model` here)
metrics_save_path = os.path.join(output_dir, "metrics.json")
val_metric = "rouge2"
metrics = load_json(metrics_save_path)
# {'test': [{'test_avg_loss': 10.63731575012207, 'test_avg_rouge1': 0.0, 'test_avg_rouge2': 0.0, 'test_avg_rougeL': 0.0, 'test_avg_gen_time': 0.1822289228439331, 'test_avg_gen_len': 142.0, 'step_count': 1}]}
print(metrics)
last_step_stats = metrics["val"][-1]
self.assertGreaterEqual(last_step_stats["val_avg_gen_time"], 0.01)
self.assertIsInstance(last_step_stats[f"val_avg_{val_metric}"], float)
self.assertEqual(len(metrics["test"]), 1)
desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1)
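# e.g. with max_epochs=2 and val_check_interval=0.5 on 2 GPUs:
# int(2 * (1 / 0.5) / 2 + 1) = 3 expected validation logs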
self.assertEqual(len(metrics["val"]), desired_n_evals)
# the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
# run ./builtin_trainer/finetune.sh --help to see all the possible options
python finetune_trainer.py \
    --learning_rate=3e-5 \
    --fp16 \
    --do_train --do_eval --do_predict \
    --evaluation_strategy steps \
    --predict_with_generate \
    --n_val 1000 \
    "$@"

# the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
# run ./finetune.sh --help to see all the possible options
python finetune.py \
    --learning_rate=3e-5 \
    --fp16 \
    --gpus 1 \
    --do_train \
    --do_predict \
    --n_val 1000 \
    --val_check_interval 0.1 \
    "$@"
import argparse
import logging
import os
from pathlib import Path
from typing import Any, Dict
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info
from transformers import (
AdamW,
AutoConfig,
AutoModel,
AutoModelForPreTraining,
AutoModelForQuestionAnswering,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
AutoModelWithLMHead,
AutoTokenizer,
PretrainedConfig,
PreTrainedTokenizer,
)
from transformers.optimization import (
Adafactor,
get_cosine_schedule_with_warmup,
get_cosine_with_hard_restarts_schedule_with_warmup,
get_linear_schedule_with_warmup,
get_polynomial_decay_schedule_with_warmup,
)
from transformers.utils.versions import require_version_examples
logger = logging.getLogger(__name__)
require_version_examples("pytorch_lightning>=1.0.4")
MODEL_MODES = {
"base": AutoModel,
"sequence-classification": AutoModelForSequenceClassification,
"question-answering": AutoModelForQuestionAnswering,
"pretraining": AutoModelForPreTraining,
"token-classification": AutoModelForTokenClassification,
"language-modeling": AutoModelWithLMHead,
"summarization": AutoModelForSeq2SeqLM,
"translation": AutoModelForSeq2SeqLM,
}
# update this and the import above to support new schedulers from transformers.optimization
arg_to_scheduler = {
"linear": get_linear_schedule_with_warmup,
"cosine": get_cosine_schedule_with_warmup,
"cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
"polynomial": get_polynomial_decay_schedule_with_warmup,
# '': get_constant_schedule, # not supported for now
# '': get_constant_schedule_with_warmup, # not supported for now
}
arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"
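# e.g. passing --lr_scheduler=cosine on the command line selects
# get_cosine_schedule_with_warmup via arg_to_scheduler["cosine"] in get_lr_scheduler below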
class BaseTransformer(pl.LightningModule):
def __init__(
self,
hparams: argparse.Namespace,
num_labels=None,
mode="base",
config=None,
tokenizer=None,
model=None,
**config_kwargs
):
"""Initialize a model, tokenizer and config."""
super().__init__()
# TODO: move to self.save_hyperparameters()
# self.save_hyperparameters()
# can also expand arguments into trainer signature for easier reading
self.save_hyperparameters(hparams)
self.step_count = 0
self.output_dir = Path(self.hparams.output_dir)
cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
if config is None:
self.config = AutoConfig.from_pretrained(
self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
**({"num_labels": num_labels} if num_labels is not None else {}),
cache_dir=cache_dir,
**config_kwargs,
)
else:
self.config: PretrainedConfig = config
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
for p in extra_model_params:
if getattr(self.hparams, p, None):
assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
setattr(self.config, p, getattr(self.hparams, p))
if tokenizer is None:
self.tokenizer = AutoTokenizer.from_pretrained(
self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
cache_dir=cache_dir,
)
else:
self.tokenizer: PreTrainedTokenizer = tokenizer
self.model_type = MODEL_MODES[mode]
if model is None:
self.model = self.model_type.from_pretrained(
self.hparams.model_name_or_path,
from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
config=self.config,
cache_dir=cache_dir,
)
else:
self.model = model
def load_hf_checkpoint(self, *args, **kwargs):
self.model = self.model_type.from_pretrained(*args, **kwargs)
def get_lr_scheduler(self):
get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
scheduler = get_schedule_func(
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps()
)
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
return scheduler
def configure_optimizers(self):
"""Prepare optimizer and schedule (linear warmup and decay)"""
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": self.hparams.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
if self.hparams.adafactor:
optimizer = Adafactor(
optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
)
else:
optimizer = AdamW(
optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
)
self.opt = optimizer
scheduler = self.get_lr_scheduler()
return [optimizer], [scheduler]
def test_step(self, batch, batch_nb):
return self.validation_step(batch, batch_nb)
def test_epoch_end(self, outputs):
return self.validation_end(outputs)
def total_steps(self) -> int:
"""The number of total training steps that will be run. Used for lr scheduler purposes."""
num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores
effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
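# e.g. (hypothetical numbers) dataset_size=64, train_batch_size=2,
# accumulate_grad_batches=1, gpus=1 -> effective_batch_size=2, so with
# max_epochs=1 this returns (64 / 2) * 1 = 32.0 (a float, despite the annotation)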
return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs
def setup(self, mode):
if mode == "test":
self.dataset_size = len(self.test_dataloader().dataset)
else:
self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
self.dataset_size = len(self.train_dataloader().dataset)
def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False):
raise NotImplementedError("You must implement this for your task")
def train_dataloader(self):
return self.train_loader
def val_dataloader(self):
return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)
def test_dataloader(self):
return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)
def _feature_file(self, mode):
return os.path.join(
self.hparams.data_dir,
"cached_{}_{}_{}".format(
mode,
list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
str(self.hparams.max_seq_length),
),
)
@pl.utilities.rank_zero_only
def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
save_path = self.output_dir.joinpath("best_tfmr")
self.model.config.save_step = self.step_count
self.model.save_pretrained(save_path)
self.tokenizer.save_pretrained(save_path)
@staticmethod
def add_model_specific_args(parser, root_dir):
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pretrained model or model identifier from huggingface.co/models",
)
parser.add_argument(
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
)
parser.add_argument(
"--tokenizer_name",
default=None,
type=str,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from huggingface.co",
)
parser.add_argument(
"--encoder_layerdrop",
type=float,
help="Encoder layer dropout probability (Optional). Goes into model.config",
)
parser.add_argument(
"--decoder_layerdrop",
type=float,
help="Decoder layer dropout probability (Optional). Goes into model.config",
)
parser.add_argument(
"--dropout",
type=float,
help="Dropout probability (Optional). Goes into model.config",
)
parser.add_argument(
"--attention_dropout",
type=float,
help="Attention dropout probability (Optional). Goes into model.config",
)
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument(
"--lr_scheduler",
default="linear",
choices=arg_to_scheduler_choices,
metavar=arg_to_scheduler_metavar,
type=str,
help="Learning rate scheduler",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
parser.add_argument("--train_batch_size", default=32, type=int)
parser.add_argument("--eval_batch_size", default=32, type=int)
parser.add_argument("--adafactor", action="store_true")
class LoggingCallback(pl.Callback):
def on_batch_end(self, trainer, pl_module):
lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_lr())}
pl_module.logger.log_metrics(lrs)
def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
rank_zero_info("***** Validation results *****")
metrics = trainer.callback_metrics
# Log results
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
rank_zero_info("***** Test results *****")
metrics = trainer.callback_metrics
# Log and save results to file
output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
with open(output_test_results_file, "w") as writer:
for key in sorted(metrics):
if key not in ["log", "progress_bar"]:
rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
writer.write("{} = {}\n".format(key, str(metrics[key])))
def add_generic_args(parser, root_dir) -> None:
# To allow all pl args uncomment the following line
# parser = pl.Trainer.add_argparse_args(parser)
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--fp16",
action="store_true",
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
)
parser.add_argument(
"--fp16_opt_level",
type=str,
default="O2",
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int)
parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
parser.add_argument(
"--gradient_accumulation_steps",
dest="accumulate_grad_batches",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
)
def generic_train(
model: BaseTransformer,
args: argparse.Namespace,
early_stopping_callback=None,
logger=True, # can pass WandbLogger() here
extra_callbacks=[],
checkpoint_callback=None,
logging_callback=None,
**extra_train_kwargs
):
pl.seed_everything(args.seed)
# init model
odir = Path(model.hparams.output_dir)
odir.mkdir(exist_ok=True)
# add custom checkpoints
if checkpoint_callback is None:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
)
if early_stopping_callback:
extra_callbacks.append(early_stopping_callback)
if logging_callback is None:
logging_callback = LoggingCallback()
train_params = {}
# TODO: remove with PyTorch 1.6 since pl uses native amp
if args.fp16:
train_params["precision"] = 16
train_params["amp_level"] = args.fp16_opt_level
if args.gpus > 1:
train_params["distributed_backend"] = "ddp"
train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
train_params["accelerator"] = extra_train_kwargs.get("accelerator", None)
train_params["profiler"] = extra_train_kwargs.get("profiler", None)
trainer = pl.Trainer.from_argparse_args(
args,
weights_summary=None,
callbacks=[logging_callback] + extra_callbacks,
logger=logger,
checkpoint_callback=checkpoint_callback,
**train_params,
)
if args.do_train:
trainer.fit(model)
return trainer
tensorboard
scikit-learn
psutil
sacrebleu
rouge-score
tensorflow_datasets
pytorch-lightning==1.0.4
matplotlib
git-python==1.0.3
faiss-cpu
streamlit
elasticsearch
nltk
pandas
datasets >= 1.1.3
fire
pytest
conllu
sentencepiece != 0.1.92
protobuf
#!/usr/bin/env python
import argparse
import datetime
import json
import time
import warnings
from logging import getLogger
from pathlib import Path
from typing import Dict, List
import torch
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils import calculate_bleu, calculate_rouge, chunks, parse_numeric_n_bool_cl_kwargs, use_task_specific_params
logger = getLogger(__name__)
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def generate_summaries_or_translations(
examples: List[str],
out_file: str,
model_name: str,
batch_size: int = 8,
device: str = DEFAULT_DEVICE,
fp16=False,
task="summarization",
prefix=None,
**generate_kwargs,
) -> Dict:
"""Save model.generate results to <out_file>, and return how long it took."""
fout = Path(out_file).open("w", encoding="utf-8")
model_name = str(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
if fp16:
model = model.half()
tokenizer = AutoTokenizer.from_pretrained(model_name)
logger.info(f"Inferred tokenizer type: {tokenizer.__class__}") # if this is wrong, check config.model_type.
start_time = time.time()
# update config with task specific params
use_task_specific_params(model, task)
if prefix is None:
prefix = getattr(model.config, "prefix", "") or ""
for examples_chunk in tqdm(list(chunks(examples, batch_size))):
examples_chunk = [prefix + text for text in examples_chunk]
batch = tokenizer(examples_chunk, return_tensors="pt", truncation=True, padding="longest").to(device)
summaries = model.generate(
input_ids=batch.input_ids,
attention_mask=batch.attention_mask,
**generate_kwargs,
)
dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for hypothesis in dec:
fout.write(hypothesis + "\n")
fout.flush()
fout.close()
runtime = int(time.time() - start_time) # seconds
n_obs = len(examples)
return dict(n_obs=n_obs, runtime=runtime, seconds_per_sample=round(runtime / n_obs, 4))
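# A minimal usage sketch (the text, output path and batch size are illustrative;
# the model is the tiny checkpoint used elsewhere in these examples):
#     stats = generate_summaries_or_translations(
#         ["Some article text."], "preds.txt", "sshleifer/bart-tiny-random", batch_size=1
#     )
#     # stats -> {"n_obs": 1, "runtime": <seconds>, "seconds_per_sample": <float>}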
def datetime_now():
return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def run_generate(verbose=True):
"""
Takes input text, generates output, and then uses the reference to calculate BLEU (or ROUGE) scores.
The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed.
Args:
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout
Returns:
a tuple: ``(scores, params)``
- ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``
- ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``
"""
parser = argparse.ArgumentParser()
parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn,t5-base, etc.")
parser.add_argument("input_path", type=str, help="like cnn_dm/test.source")
parser.add_argument("save_path", type=str, help="where to save summaries")
parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test.target")
parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
parser.add_argument(
"--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples"
)
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
parser.add_argument(
"--n_obs", type=int, default=-1, required=False, help="How many observations. Defaults to all."
)
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--dump-args", action="store_true", help="print the custom hparams with the results")
parser.add_argument(
"--info",
nargs="?",
type=str,
const=datetime_now(),
help="use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.",
)
# Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
args, rest = parser.parse_known_args()
parsed_args = parse_numeric_n_bool_cl_kwargs(rest)
if parsed_args and verbose:
print(f"parsed the following generate kwargs: {parsed_args}")
examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path).readlines()]
if args.n_obs > 0:
examples = examples[: args.n_obs]
Path(args.save_path).parent.mkdir(exist_ok=True)
if args.reference_path is None and Path(args.score_path).exists():
warnings.warn(f"score_path {args.score_path} will be overwritten unless you type ctrl-c.")
runtime_metrics = generate_summaries_or_translations(
examples,
args.save_path,
args.model_name,
batch_size=args.bs,
device=args.device,
fp16=args.fp16,
task=args.task,
prefix=args.prefix,
**parsed_args,
)
if args.reference_path is None:
return {}
# Compute scores
score_fn = calculate_bleu if "translation" in args.task else calculate_rouge
output_lns = [x.rstrip() for x in open(args.save_path).readlines()]
reference_lns = [x.rstrip() for x in open(args.reference_path).readlines()][: len(output_lns)]
scores: dict = score_fn(output_lns, reference_lns)
scores.update(runtime_metrics)
if args.dump_args:
scores.update(parsed_args)
if args.info:
scores["info"] = args.info
if verbose:
print(scores)
if args.score_path is not None:
json.dump(scores, open(args.score_path, "w"))
return scores
if __name__ == "__main__":
# Usage for MT:
# python run_eval.py MODEL_NAME $DATA_DIR/test.source $save_dir/test_translations.txt --reference_path $DATA_DIR/test.target --score_path $save_dir/test_bleu.json --task translation $@
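# Hypothetical usage for summarization (paths and model are placeholders):
# python run_eval.py facebook/bart-large-cnn $DATA_DIR/test.source $save_dir/test_summaries.txt --reference_path $DATA_DIR/test.target --score_path $save_dir/test_rouge.json --task summarization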
run_generate(verbose=True)