"tests/test_model_card.py" did not exist on "61d9ee45e3f19b9c661e078e7f57dbe8fb8c812c"
Unverified Commit 146c5212 authored by Lysandre Debut, committed by GitHub

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
......@@ -33,6 +33,9 @@ from tqdm import tqdm, trange
from transformers import (
WEIGHTS_NAME,
AdamW,
AlbertConfig,
AlbertForTokenClassification,
AlbertTokenizer,
BertConfig,
BertForTokenClassification,
BertTokenizer,
......@@ -70,6 +73,7 @@ ALL_MODELS = sum(
)
MODEL_CLASSES = {
"albert": (AlbertConfig, AlbertForTokenClassification, AlbertTokenizer),
"bert": (BertConfig, BertForTokenClassification, BertTokenizer),
"roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
......@@ -77,6 +81,8 @@ MODEL_CLASSES = {
"xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
}
TOKENIZER_ARGS = ["do_lower_case", "strip_accents", "keep_accents", "use_fast"]
def set_seed(args):
random.seed(args.seed)
......@@ -462,7 +468,13 @@ def main():
parser.add_argument(
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
)
parser.add_argument(
"--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents."
)
parser.add_argument(
"--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents."
)
parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
......@@ -545,7 +557,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......@@ -590,10 +602,12 @@ def main():
label2id={label: i for i, label in enumerate(labels)},
cache_dir=args.cache_dir if args.cache_dir else None,
)
tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
logger.info("Tokenizer arguments: %s", tokenizer_args)
tokenizer = tokenizer_class.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
do_lower_case=args.do_lower_case,
cache_dir=args.cache_dir if args.cache_dir else None,
**tokenizer_args,
)
model = model_class.from_pretrained(
args.model_name_or_path,
......@@ -636,7 +650,7 @@ def main():
# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args)
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(
......@@ -658,7 +672,7 @@ def main():
writer.write("{} = {}\n".format(key, str(results[key])))
if args.do_predict and args.local_rank in [-1, 0]:
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args)
model = model_class.from_pretrained(args.output_dir)
model.to(args.device)
result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
......
# Require pytorch-lightning=0.6
# Install newest ptl.
pip install -U git+http://github.com/PyTorchLightning/pytorch-lightning/
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
export MAX_LENGTH=128
export BERT_MODEL=bert-base-multilingual-cased
python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
export OUTPUT_DIR=germeval-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
......
......@@ -7,8 +7,7 @@ import numpy as np
import torch
from seqeval.metrics import f1_score, precision_score, recall_score
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader, TensorDataset
from transformer_base import BaseTransformer, add_generic_args, generic_train
from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
......@@ -25,13 +24,14 @@ class NERTransformer(BaseTransformer):
def __init__(self, hparams):
self.labels = get_labels(hparams.labels)
num_labels = len(self.labels)
self.pad_token_label_id = CrossEntropyLoss().ignore_index
super(NERTransformer, self).__init__(hparams, num_labels)
def forward(self, **inputs):
return self.model(**inputs)
def training_step(self, batch, batch_num):
"Compute loss"
"Compute loss and log."
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if self.hparams.model_type != "distilbert":
inputs["token_type_ids"] = (
......@@ -40,25 +40,61 @@ class NERTransformer(BaseTransformer):
outputs = self.forward(**inputs)
loss = outputs[0]
tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
return {"loss": loss, "log": tensorboard_logs}
def _feature_file(self, mode):
return os.path.join(
self.hparams.data_dir,
"cached_{}_{}_{}".format(
mode,
list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
str(self.hparams.max_seq_length),
),
)
def prepare_data(self):
"Called to initialize data. Use the call to construct features"
args = self.hparams
for mode in ["train", "dev", "test"]:
cached_features_file = self._feature_file(mode)
if not os.path.exists(cached_features_file):
logger.info("Creating features from dataset file at %s", args.data_dir)
examples = read_examples_from_file(args.data_dir, mode)
features = convert_examples_to_features(
examples,
self.labels,
args.max_seq_length,
self.tokenizer,
cls_token_at_end=bool(args.model_type in ["xlnet"]),
cls_token=self.tokenizer.cls_token,
cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
sep_token=self.tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ["roberta"]),
pad_on_left=bool(args.model_type in ["xlnet"]),
pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
pad_token_label_id=self.pad_token_label_id,
)
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
def load_dataset(self, mode, batch_size):
labels = get_labels(self.hparams.labels)
self.pad_token_label_id = CrossEntropyLoss().ignore_index
dataset = self.load_and_cache_examples(labels, self.pad_token_label_id, mode)
if mode == "train":
if self.hparams.n_gpu > 1:
sampler = DistributedSampler(dataset)
else:
sampler = RandomSampler(dataset)
else:
sampler = SequentialSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
return dataloader
"Load datasets. Called after prepare data."
cached_features_file = self._feature_file(mode)
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
return DataLoader(
TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids), batch_size=batch_size
)
def validation_step(self, batch, batch_nb):
"Compute validation"
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if self.hparams.model_type != "distilbert":
inputs["token_type_ids"] = (
......@@ -68,11 +104,10 @@ class NERTransformer(BaseTransformer):
tmp_eval_loss, logits = outputs[:2]
preds = logits.detach().cpu().numpy()
out_label_ids = inputs["labels"].detach().cpu().numpy()
return {"val_loss": tmp_eval_loss, "pred": preds, "target": out_label_ids}
return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
def _eval_end(self, outputs):
"Task specific validation"
"Evaluation called for both Val and Test"
val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
preds = np.concatenate([x["pred"] for x in outputs], axis=0)
preds = np.argmax(preds, axis=2)
......@@ -96,7 +131,6 @@ class NERTransformer(BaseTransformer):
}
if self.is_logger():
logger.info(self.proc_rank)
logger.info("***** Eval results *****")
for key in sorted(results.keys()):
logger.info(" %s = %s", key, str(results[key]))
......@@ -140,56 +174,6 @@ class NERTransformer(BaseTransformer):
)
return ret
def load_and_cache_examples(self, labels, pad_token_label_id, mode):
args = self.hparams
tokenizer = self.tokenizer
if self.proc_rank not in [-1, 0] and mode == "train":
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Load data features from cache or dataset file
cached_features_file = os.path.join(
args.data_dir,
"cached_{}_{}_{}".format(
mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
),
)
if os.path.exists(cached_features_file) and not args.overwrite_cache:
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
examples = read_examples_from_file(args.data_dir, mode)
features = convert_examples_to_features(
examples,
labels,
args.max_seq_length,
tokenizer,
cls_token_at_end=bool(args.model_type in ["xlnet"]),
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ["roberta"]),
pad_on_left=bool(args.model_type in ["xlnet"]),
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
pad_token_label_id=pad_token_label_id,
)
if self.proc_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
if self.proc_rank == 0 and mode == "train":
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
return dataset
@staticmethod
def add_model_specific_args(parser, root_dir):
# Add NER specific options
......
import logging
import os
import random
......@@ -26,6 +27,9 @@ from transformers import (
)
logger = logging.getLogger(__name__)
ALL_MODELS = sum(
(
tuple(conf.pretrained_config_archive_map.keys())
......@@ -77,20 +81,14 @@ class BaseTransformer(pl.LightningModule):
cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
)
self.config, self.tokenizer, self.model = config, tokenizer, model
self.proc_rank = -1
def is_logger(self):
return self.proc_rank <= 0
return self.trainer.proc_rank <= 0
def configure_optimizers(self):
"Prepare optimizer and schedule (linear warmup and decay)"
model = self.model
t_total = (
len(self.train_dataloader())
// self.hparams.gradient_accumulation_steps
* float(self.hparams.num_train_epochs)
)
model = self.model
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
......@@ -103,18 +101,16 @@ class BaseTransformer(pl.LightningModule):
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
)
self.lr_scheduler = scheduler
self.opt = optimizer
return [optimizer]
def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
# Step each time.
optimizer.step()
self.lr_scheduler.step()
if self.trainer.use_tpu:
xm.optimizer_step(optimizer)
else:
optimizer.step()
optimizer.zero_grad()
self.lr_scheduler.step()
def get_tqdm_dict(self):
tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
......@@ -127,22 +123,27 @@ class BaseTransformer(pl.LightningModule):
def test_end(self, outputs):
return self.validation_end(outputs)
@pl.data_loader
def train_dataloader(self):
return self.load_dataset("train", self.hparams.train_batch_size)
train_batch_size = self.hparams.train_batch_size
dataloader = self.load_dataset("train", train_batch_size)
t_total = (
(len(dataloader.dataset) // (train_batch_size * max(1, self.hparams.n_gpu)))
// self.hparams.gradient_accumulation_steps
* float(self.hparams.num_train_epochs)
)
scheduler = get_linear_schedule_with_warmup(
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
)
self.lr_scheduler = scheduler
return dataloader
@pl.data_loader
def val_dataloader(self):
return self.load_dataset("dev", self.hparams.eval_batch_size)
@pl.data_loader
def test_dataloader(self):
return self.load_dataset("test", self.hparams.eval_batch_size)
def init_ddp_connection(self, proc_rank, world_size):
self.proc_rank = proc_rank
super(BaseTransformer, self).init_ddp_connection(proc_rank, world_size)
@staticmethod
def add_model_specific_args(parser, root_dir):
parser.add_argument(
......@@ -213,6 +214,7 @@ def add_generic_args(parser, root_dir):
)
parser.add_argument("--n_gpu", type=int, default=1)
parser.add_argument("--n_tpu_cores", type=int, default=0)
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
......@@ -252,13 +254,22 @@ def generic_train(model, args):
accumulate_grad_batches=args.gradient_accumulation_steps,
gpus=args.n_gpu,
max_epochs=args.num_train_epochs,
early_stop_callback=False,
gradient_clip_val=args.max_grad_norm,
checkpoint_callback=checkpoint_callback,
)
if args.fp16:
train_params["use_amp"] = args.fp16
train_params["amp_level"] = args.fp16_opt_level
if args.n_tpu_cores > 0:
global xm
import torch_xla.core.xla_model as xm
train_params["num_tpu_cores"] = args.n_tpu_cores
train_params["gpus"] = 0
if args.n_gpu > 1:
train_params["distributed_backend"] = "ddp"
......
......@@ -338,7 +338,7 @@ def main():
# Setup devices and distributed training
if args.local_rank == -1 or args.no_cuda:
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
args.device = torch.device("cuda", args.local_rank)
......
......@@ -189,7 +189,7 @@ def main():
args = parser.parse_args()
args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
set_seed(args)
......
......@@ -183,8 +183,11 @@ def train(args, train_dataset, model, tokenizer):
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
# set global_step to gobal_step of last saved checkpoint from model path
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
# set global_step to global_step of last saved checkpoint from model path
try:
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
except ValueError:
global_step = 0
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
......@@ -575,7 +578,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......
......@@ -663,7 +663,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......
......@@ -535,7 +535,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......
......@@ -725,7 +725,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......
......@@ -530,7 +530,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......
### Get the CNN/Daily Mail Data
To be able to reproduce the authors' results on the CNN/Daily Mail dataset, you first need to download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running:
```bash
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
```
This should create a directory called `cnn_dm/` with files like `test.source`.
To use your own data, follow the same file format: each article to be summarized is on its own line.
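For example, a minimal `test.source` for your own data could look like the sketch below (the `my_data/` directory name is only illustrative):
```bash
# One article per line; the script treats each line as a separate document to summarize.
mkdir -p my_data
cat > my_data/test.source <<'EOF'
The first article to be summarized, written entirely on a single line.
The second article to be summarized, also on a single line.
EOF
```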
### Usage
To create summaries for each article in the dataset, run:
```bash
python evaluate_cnn.py <path_to_test.source> cnn_test_summaries.txt
```
The default batch size, 8, fits in 16 GB of GPU memory, but may need to be adjusted for your system.
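As a sketch, the batch size can be lowered (and the device pinned) with the `--bs` and `--device` flags defined in `evaluate_cnn.py`; the paths here are illustrative:
```bash
# Halve the default batch size and run on the first GPU; adjust to your hardware.
python evaluate_cnn.py cnn_dm/test.source cnn_test_summaries.txt --bs 4 --device cuda:0
```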
### Where is the code?
The core model is in `src/transformers/modeling_bart.py`. This directory only contains examples.
### (WIP) Rouge Scores
### Stanford CoreNLP Setup
```
ptb_tokenize () {
cat $1 | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > $2
}
sudo apt install openjdk-8-jre-headless
sudo apt-get install ant
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
unzip stanford-corenlp-full-2018-10-05.zip
cd stanford-corenlp-full-2018-10-05
export CLASSPATH=stanford-corenlp-3.9.2.jar:stanford-corenlp-3.9.2-models.jar
```
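Once `CLASSPATH` points at the CoreNLP jars, the `ptb_tokenize` helper above can be applied to both the generated and the reference summaries before scoring; the file names below are hypothetical:
```bash
# Hypothetical inputs: generated summaries and gold summaries, one per line.
ptb_tokenize cnn_test_summaries.txt cnn_test_summaries.tokenized
ptb_tokenize cnn_dm/test.target cnn_test_reference.tokenized
```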
### Rouge Setup
Install `files2rouge` following the instructions [here](https://github.com/pltrdy/files2rouge).
I also needed to run `sudo apt-get install libxml-parser-perl`.
```python
from files2rouge import files2rouge
from files2rouge import settings
files2rouge.run(<path_to_tokenized_hypo>,
<path_to_tokenized_target>,
saveto='rouge_output.txt')
```
import argparse
from pathlib import Path
import torch
from tqdm import tqdm
from transformers import BartForMaskedLM, BartTokenizer
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE):
fout = Path(out_file).open("w")
model = BartForMaskedLM.from_pretrained("bart-large-cnn", output_past=True,)
tokenizer = BartTokenizer.from_pretrained("bart-large")
for batch in tqdm(list(chunks(lns, batch_size))):
dct = tokenizer.batch_encode_plus(batch, max_length=1024, return_tensors="pt", pad_to_max_length=True)
summaries = model.generate(
input_ids=dct["input_ids"].to(device),
attention_mask=dct["attention_mask"].to(device),
num_beams=4,
length_penalty=2.0,
max_length=140,
min_len=55,
no_repeat_ngram_size=3,
)
dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
for hypothesis in dec:
fout.write(hypothesis + "\n")
fout.flush()
def _run_generate():
parser = argparse.ArgumentParser()
parser.add_argument(
"source_path", type=str, help="like cnn_dm/test.source",
)
parser.add_argument(
"output_path", type=str, help="where to save summaries",
)
parser.add_argument(
"--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.",
)
parser.add_argument(
"--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time",
)
args = parser.parse_args()
lns = [" " + x.rstrip() for x in open(args.source_path).readlines()]
generate_summaries(lns, args.output_path, batch_size=args.bs, device=args.device)
if __name__ == "__main__":
_run_generate()
import logging
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from .evaluate_cnn import _run_generate
articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
class TestBartExamples(unittest.TestCase):
def test_bart_cnn_cli(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
tmp = Path(tempfile.gettempdir()) / "utest_generations.hypo"
with tmp.open("w") as f:
f.write("\n".join(articles))
testargs = ["evaluate_cnn.py", str(tmp), "output.txt"]
with patch.object(sys, "argv", testargs):
_run_generate()
self.assertTrue(Path("output.txt").exists())
......@@ -15,7 +15,7 @@ pip install nltk py-rouge
cd examples/summarization
```
## Reproduce the authors' results on ROUGE
## Reproduce the authors' ROUGE score
To be able to reproduce the authors' results on the CNN/Daily Mail dataset, you first need to download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running:
......