formatting

1c933358 · thomwolf · e25b6fe3 · 1c933358 · 1c933358 · 1c933358
Commit 1c933358 authored Jan 06, 2020 by thomwolf
Showing with 358 additions and 240 deletions

examples/hans/hans_processors.py examples/hans/hans_processors.py +62 -51

examples/hans/test_hans.py examples/hans/test_hans.py +291 -186

examples/hans/utils_hans.py examples/hans/utils_hans.py +5 -3

No files found.
--- a/examples/hans/hans_processors.py
+++ b/examples/hans/hans_processors.py
@@ -18,8 +18,9 @@
 import logging
 import os
-from utils_hans import DataProcessor, InputExample, InputFeatures
 from transformers.file_utils import is_tf_available
+from utils_hans import DataProcessor, InputExample, InputFeatures
 if is_tf_available():
    import tensorflow as tf
@@ -27,15 +28,18 @@ if is_tf_available():
 logger = logging.getLogger(__name__)
-def hans_convert_examples_to_features(examples, tokenizer,
+def hans_convert_examples_to_features(
-                                      max_length=512,
+    examples,
-                                      task=None,
+    tokenizer,
-                                      label_list=None,
+    max_length=512,
-                                      output_mode=None,
+    task=None,
-                                      pad_on_left=False,
+    label_list=None,
-                                      pad_token=0,
+    output_mode=None,
-                                      pad_token_segment_id=0,
+    pad_on_left=False,
-                                      mask_padding_with_zero=True):
+    pad_token=0,
+    pad_token_segment_id=0,
+    mask_padding_with_zero=True,
+):
    """
    Loads a data file into a list of ``InputFeatures``
@@ -82,12 +86,7 @@ def hans_convert_examples_to_features(examples, tokenizer,
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)
-        inputs = tokenizer.encode_plus(
+        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
-            example.text_a,
-            example.text_b,
-            add_special_tokens=True,
-            max_length=max_length,
-        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
@@ -106,8 +105,12 @@ def hans_convert_examples_to_features(examples, tokenizer,
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
-        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
+            len(attention_mask), max_length
+        )
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
+            len(token_type_ids), max_length
+        )
        if output_mode == "classification":
            label = label_map[example.label] if example.label in label_map else 0
@@ -128,28 +131,40 @@ def hans_convert_examples_to_features(examples, tokenizer,
            logger.info("label: %s (id = %d)" % (example.label, label))
        features.append(
-                InputFeatures(input_ids=input_ids,
+            InputFeatures(
-                              attention_mask=attention_mask,
+                input_ids=input_ids,
-                              token_type_ids=token_type_ids,
+                attention_mask=attention_mask,
-                              label=label, pairID=pairID))
+                token_type_ids=token_type_ids,
+                label=label,
+                pairID=pairID,
+            )
+        )
    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
-                yield  ({'input_ids': ex.input_ids,
+                yield (
-                         'attention_mask': ex.attention_mask,
+                    {
-                         'token_type_ids': ex.token_type_ids},
+                        "input_ids": ex.input_ids,
-                        ex.label)
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
-        return tf.data.Dataset.from_generator(gen,
+                    },
-            ({'input_ids': tf.int32,
+                    ex.label,
-              'attention_mask': tf.int32,
+                )
-              'token_type_ids': tf.int32},
-             tf.int64),
+        return tf.data.Dataset.from_generator(
-            ({'input_ids': tf.TensorShape([None]),
+            gen,
-              'attention_mask': tf.TensorShape([None]),
+            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
-              'token_type_ids': tf.TensorShape([None])},
+            (
-             tf.TensorShape([])))
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                tf.TensorShape([]),
+            ),
+        )
    return features
@@ -159,21 +174,20 @@ class HansProcessor(DataProcessor):
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
-        return InputExample(tensor_dict['idx'].numpy(),
+        return InputExample(
-                            tensor_dict['premise'].numpy().decode('utf-8'),
+            tensor_dict["idx"].numpy(),
-                            tensor_dict['hypothesis'].numpy().decode('utf-8'),
+            tensor_dict["premise"].numpy().decode("utf-8"),
-                            str(tensor_dict['label'].numpy()))
+            tensor_dict["hypothesis"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
    def get_train_examples(self, data_dir):
        """See base class."""
-        return self._create_examples(
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
-            self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
    def get_dev_examples(self, data_dir):
        """See base class."""
-        return self._create_examples(
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
-            self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")),
-            "dev")
    def get_labels(self):
        """See base class."""
@@ -188,14 +202,12 @@ class HansProcessor(DataProcessor):
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[5]
            text_b = line[6]
-            pairID = line[7][2:] if line[7].startswith('ex') else line[7]
+            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
-            label = line[-1] 
+            label = line[-1]
-            examples.append(
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
-                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
        return examples
 glue_tasks_num_labels = {
    "hans": 3,
 }
@@ -207,4 +219,3 @@ glue_processors = {
 glue_output_modes = {
    "hans": "classification",
 }
--- a/examples/hans/test_hans.py
+++ b/examples/hans/test_hans.py
@@ -19,60 +19,72 @@ from __future__ import absolute_import, division, print_function
 import argparse
 import glob
+import json
 import logging
 import os
 import random
-import json
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+from hans_processors import glue_output_modes as output_modes
+from hans_processors import glue_processors as processors
+from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    AlbertTokenizer,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+    RobertaConfig,
+    RobertaForSequenceClassification,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from transformers import glue_compute_metrics as compute_metrics
 try:
    from torch.utils.tensorboard import SummaryWriter
 except:
    from tensorboardX import SummaryWriter
-from tqdm import tqdm, trange
-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForSequenceClassification, BertTokenizer,
-                                  RobertaConfig,
-                                  RobertaForSequenceClassification,
-                                  RobertaTokenizer,
-                                  XLMConfig, XLMForSequenceClassification,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer,
-                                  DistilBertConfig,
-                                  DistilBertForSequenceClassification,
-                                  DistilBertTokenizer,
-                                  AlbertConfig,
-                                  AlbertForSequenceClassification, 
-                                  AlbertTokenizer,
-                                )
-from transformers import AdamW, get_linear_schedule_with_warmup
-from transformers import glue_compute_metrics as compute_metrics
-from hans_processors import glue_output_modes as output_modes
-from hans_processors import glue_processors as processors
-from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
 logger = logging.getLogger(__name__)
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
+ALL_MODELS = sum(
-                                                                                RobertaConfig, DistilBertConfig)), ())
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)
 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
+    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
 }
@@ -100,14 +112,19 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-        ]
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -121,17 +138,21 @@ def train(args, train_dataset, model, tokenizer):
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+        model = torch.nn.parallel.DistributedDataParallel(
-                                                          output_device=args.local_rank,
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-                                                          find_unused_parameters=True)
+        )
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+    logger.info(
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
@@ -145,16 +166,16 @@ def train(args, train_dataset, model, tokenizer):
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                      'attention_mask': batch[1],
+            if args.model_type != "distilbert":
-                      'labels':         batch[3]}
+                inputs["token_type_ids"] = (
-            if args.model_type != 'distilbert':
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
@@ -178,30 +199,34 @@ def train(args, train_dataset, model, tokenizer):
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            eval_key = 'eval_{}'.format(key)
+                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs['learning_rate'] = learning_rate_scalar
+                    logs["learning_rate"] = learning_rate_scalar
-                    logs['loss'] = loss_scalar
+                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
-                    #print(json.dumps({**logs, **{'step': global_step}}))
+                    # print(json.dumps({**logs, **{'step': global_step}}))
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
@@ -220,7 +245,7 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
@@ -251,11 +276,11 @@ def evaluate(args, model, tokenizer, prefix=""):
            batch = tuple(t.to(args.device) for t in batch)
            with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                          'attention_mask': batch[1],
+                if args.model_type != "distilbert":
-                          'labels':         batch[3]}
+                    inputs["token_type_ids"] = (
-                if args.model_type != 'distilbert':
+                        batch[2] if args.model_type in ["bert", "xlnet"] else None
-                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
@@ -263,11 +288,11 @@ def evaluate(args, model, tokenizer, prefix=""):
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
                pair_ids = batch[4].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
                pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0)
        eval_loss = eval_loss / nb_eval_steps
@@ -280,7 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""):
        with open(output_eval_file, "w") as writer:
            writer.write("pairID,gld_label\n")
            for pid, pred in zip(pair_ids, preds):
-                writer.write('ex' + str(pid) + ',' + label_list[int(pred)] + '\n')
+                writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")
    return results
@@ -292,11 +317,15 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+    cached_features_file = os.path.join(
-        'dev' if evaluate else 'train',
+        args.data_dir,
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        "cached_{}_{}_{}_{}".format(
-        str(args.max_seq_length),
+            "dev" if evaluate else "train",
-        str(task)))
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
    label_list = processor.get_labels()
@@ -305,18 +334,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
-        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
+        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1] 
+            label_list[1], label_list[2] = label_list[2], label_list[1]
-        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        examples = (
-        features = convert_examples_to_features(examples,
+            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-                                                tokenizer,
+        )
-                                                label_list=label_list,
+        features = convert_examples_to_features(
-                                                max_length=args.max_seq_length,
+            examples,
-                                                output_mode=output_mode,
+            tokenizer,
-                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+            label_list=label_list,
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            max_length=args.max_seq_length,
-                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+            output_mode=output_mode,
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -335,7 +367,6 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
    return dataset, label_list
@@ -344,90 +375,149 @@ def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
+    parser.add_argument(
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+        "--data_dir",
-    parser.add_argument("--model_type", default=None, type=str, required=True,
+        default=None,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+        type=str,
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+        required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    parser.add_argument("--task_name", default=None, type=str, required=True,
+    )
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument(
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
+        "--model_type",
-                        help="The output directory where the model predictions and checkpoints will be written.")
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
+    parser.add_argument(
-                        help="Pretrained config name or path if not the same as model_name")
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    parser.add_argument("--tokenizer_name", default="", type=str,
+    )
-                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument(
-    parser.add_argument("--cache_dir", default="", type=str,
+        "--tokenizer_name",
-                        help="Where do you want to store the pre-trained models downloaded from s3")
+        default="",
-    parser.add_argument("--max_seq_length", default=128, type=int,
+        type=str,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
+        help="Pretrained tokenizer name or path if not the same as model_name",
-                             "than this will be truncated, sequences shorter will be padded.")
+    )
-    parser.add_argument("--do_train", action='store_true',
+    parser.add_argument(
-                        help="Whether to run training.")
+        "--cache_dir",
-    parser.add_argument("--do_eval", action='store_true',
+        default="",
-                        help="Whether to run eval on the dev set.")
+        type=str,
-    parser.add_argument("--evaluate_during_training", action='store_true',
+        help="Where do you want to store the pre-trained models downloaded from s3",
-                        help="Rul evaluation during training at each logging step.")
+    )
-    parser.add_argument("--do_lower_case", action='store_true',
+    parser.add_argument(
-                        help="Set this flag if you are using an uncased model.")
+        "--max_seq_length",
+        default=128,
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+        type=int,
-                        help="Batch size per GPU/CPU for training.")
+        help="The maximum total input sequence length after tokenization. Sequences longer "
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+        "than this will be truncated, sequences shorter will be padded.",
-                        help="Batch size per GPU/CPU for evaluation.")
+    )
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")     
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
+    parser.add_argument(
-                        help="The initial learning rate for Adam.")
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
-    parser.add_argument("--weight_decay", default=0.0, type=float,
+    )
-                        help="Weight decay if we apply some.")
+    parser.add_argument(
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-                        help="Epsilon for Adam optimizer.")
+    )
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+    parser.add_argument(
-                        help="Total number of training epochs to perform.")
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    parser.add_argument("--max_steps", default=-1, type=int,
+    )
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument(
-    parser.add_argument("--warmup_steps", default=0, type=int,
+        "--gradient_accumulation_steps",
-                        help="Linear warmup over warmup_steps.")
+        type=int,
+        default=1,
-    parser.add_argument('--logging_steps', type=int, default=50,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
-                        help="Log every X updates steps.")
+    )
-    parser.add_argument('--save_steps', type=int, default=50,
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument("--no_cuda", action='store_true',
+    parser.add_argument(
-                        help="Avoid using CUDA when available")
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    parser.add_argument('--overwrite_output_dir', action='store_true',
+    )
-                        help="Overwrite the content of the output directory")
+    parser.add_argument(
-    parser.add_argument('--overwrite_cache', action='store_true',
+        "--max_steps",
-                        help="Overwrite the cached training and evaluation sets")
+        default=-1,
-    parser.add_argument('--seed', type=int, default=42,
+        type=int,
-                        help="random seed for initialization")
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
-    parser.add_argument('--fp16', action='store_true',
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
-                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument(
-    parser.add_argument("--local_rank", type=int, default=-1,
+        "--eval_all_checkpoints",
-                        help="For distributed training: local_rank")
+        action="store_true",
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+    if (
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -439,16 +529,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+    logging.basicConfig(
-                        datefmt = '%m/%d/%Y %H:%M:%S',
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+        datefmt="%m/%d/%Y %H:%M:%S",
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
    # Set seed
    set_seed(args)
@@ -468,17 +566,23 @@ def main():
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+    config = config_class.from_pretrained(
-                                          num_labels=num_labels,
+        args.config_name if args.config_name else args.model_name_or_path,
-                                          finetuning_task=args.task_name,
+        num_labels=num_labels,
-                                          cache_dir=args.cache_dir if args.cache_dir else None)
+        finetuning_task=args.task_name,
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
-                                                do_lower_case=args.do_lower_case,
+    )
-                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    tokenizer = tokenizer_class.from_pretrained(
-    model = model_class.from_pretrained(args.model_name_or_path,
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                        from_tf=bool('.ckpt' in args.model_name_or_path),
+        do_lower_case=args.do_lower_case,
-                                        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
-                                        cache_dir=args.cache_dir if args.cache_dir else None)
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -487,14 +591,12 @@ def main():
    logger.info("Training/evaluation parameters %s", args)
    # Training
    if args.do_train:
        train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -504,36 +606,39 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    return results

--- a/examples/hans/utils_hans.py
+++ b/examples/hans/utils_hans.py
@@ -14,10 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import csv
-import sys
 import copy
+import csv
 import json
+import sys
 class InputExample(object):
    """
@@ -32,6 +33,7 @@ class InputExample(object):
        label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
        self.guid = guid
        self.text_a = text_a
@@ -117,6 +119,6 @@ class DataProcessor(object):
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+                    line = list(unicode(cell, "utf-8") for cell in line)
                lines.append(line)
            return lines