Commit 40ed7172 authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
@@ -43,7 +43,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                           XLNetTokenizer, RobertaConfig,
                           RobertaForMultipleChoice, RobertaTokenizer)
-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup
 from utils_multiple_choice import (convert_examples_to_features, processors)
@@ -101,7 +101,7 @@ def train(args, train_dataset, model, tokenizer):
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -226,9 +226,13 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
         args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
         # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_sampler = SequentialSampler(eval_dataset)
         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+        # multi-gpu evaluate
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
         # Eval!
         logger.info("***** Running evaluation {} *****".format(prefix))
         logger.info("  Num examples = %d", len(eval_dataset))
@@ -464,9 +468,17 @@ def main():
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+                                          num_labels=num_labels,
+                                          finetuning_task=args.task_name,
+                                          cache_dir=args.cache_dir if args.cache_dir else None)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+                                                do_lower_case=args.do_lower_case,
+                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    model = model_class.from_pretrained(args.model_name_or_path,
+                                        from_tf=bool('.ckpt' in args.model_name_or_path),
+                                        config=config,
+                                        cache_dir=args.cache_dir if args.cache_dir else None)

     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert). """
+""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """

 from __future__ import absolute_import, division, print_function
@@ -33,17 +33,23 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

 from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file

-from transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, get_linear_schedule_with_warmup
 from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
+from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
+from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
+from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer

 logger = logging.getLogger(__name__)

 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, )),
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
     ())

 MODEL_CLASSES = {
     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
 }
@@ -78,7 +84,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
     ]
     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -119,9 +125,10 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {"input_ids": batch[0],
                       "attention_mask": batch[1],
-                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
-                      # XLM and RoBERTa don"t use segment_ids
                       "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids

             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
@@ -133,13 +140,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 scheduler.step()  # Update learning rate schedule
                 optimizer.step()
                 model.zero_grad()
@@ -148,7 +158,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id)
+                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
                     for key, value in results.items():
                         tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -186,6 +196,10 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
     eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
     eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Eval!
     logger.info("***** Running evaluation %s *****", prefix)
     logger.info("  Num examples = %d", len(eval_dataset))
@@ -201,12 +215,15 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
         with torch.no_grad():
             inputs = {"input_ids": batch[0],
                       "attention_mask": batch[1],
-                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
-                      # XLM and RoBERTa don"t use segment_ids
                       "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids

             outputs = model(**inputs)
             tmp_eval_loss, logits = outputs[:2]

+            if args.n_gpu > 1:
+                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
+
         eval_loss += tmp_eval_loss.item()
         nb_eval_steps += 1
         if preds is None:
@@ -420,11 +437,15 @@ def main():
     args.model_type = args.model_type.lower()
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels)
+                                          num_labels=num_labels,
+                                          cache_dir=args.cache_dir if args.cache_dir else None)
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-                                                do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path),
-                                        config=config)
+                                                do_lower_case=args.do_lower_case,
+                                                cache_dir=args.cache_dir if args.cache_dir else None)
+    model = model_class.from_pretrained(args.model_name_or_path,
+                                        from_tf=bool(".ckpt" in args.model_name_or_path),
+                                        config=config,
+                                        cache_dir=args.cache_dir if args.cache_dir else None)

     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -508,3 +529,4 @@ def main():

 if __name__ == "__main__":
     main()
 import os
 import tensorflow as tf
 import tensorflow_datasets
-from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
+from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors

 # script parameters
 BATCH_SIZE = 32
 EVAL_BATCH_SIZE = BATCH_SIZE * 2
 USE_XLA = False
 USE_AMP = False
+EPOCHS = 3
+
+TASK = "mrpc"
+if TASK == "sst-2":
+    TFDS_TASK = "sst2"
+elif TASK == "sts-b":
+    TFDS_TASK = "stsb"
+else:
+    TFDS_TASK = TASK
+
+num_labels = len(glue_processors[TASK]().get_labels())
+print(num_labels)

 tf.config.optimizer.set_jit(USE_XLA)
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

-# Load tokenizer and model from pretrained model/vocabulary
+# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
+config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)

 # Load dataset via TensorFlow Datasets
-data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
+data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
 train_examples = info.splits['train'].num_examples
+
+# MNLI expects either validation_matched or validation_mismatched
 valid_examples = info.splits['validation'].num_examples

 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
+train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
+
+# MNLI expects either validation_matched or validation_mismatched
+valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
 train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
@@ -32,7 +50,13 @@ opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 if USE_AMP:
     # loss scaling is currently required when using mixed precision
     opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+if num_labels == 1:
+    loss = tf.keras.losses.MeanSquaredError()
+else:
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=opt, loss=loss, metrics=[metric])
@@ -40,24 +64,30 @@ model.compile(optimizer=opt, loss=loss, metrics=[metric])
 train_steps = train_examples//BATCH_SIZE
 valid_steps = valid_examples//EVAL_BATCH_SIZE

-history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
+history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
                     validation_data=valid_dataset, validation_steps=valid_steps)

 # Save TF2 model
 os.makedirs('./save/', exist_ok=True)
 model.save_pretrained('./save/')

-# Load the TensorFlow model in PyTorch for inspection
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+if TASK == "mrpc":
+    # Load the TensorFlow model in PyTorch for inspection
+    # This is to demo the interoperability between the two frameworks, you don't have to
+    # do this in real life (you can run the inference on the TF model).
+    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)

-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = 'This research was consistent with his findings.'
-sentence_1 = 'His findings were compatible with this research.'
-sentence_2 = 'His findings were not compatible with this research.'
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+    sentence_0 = 'This research was consistent with his findings.'
+    sentence_1 = 'His findings were compatible with this research.'
+    sentence_2 = 'His findings were not compatible with this research.'
+    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+
+    del inputs_1["special_tokens_mask"]
+    del inputs_2["special_tokens_mask"]

-pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
-print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
+    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+    print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
+    print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
# Text Summarization with Pretrained Encoders
This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
The original code can be found in Yang Liu's [GitHub repository](https://github.com/nlpyang/PreSumm).
The model is loaded with pre-trained weights for the abstractive summarization model trained on the CNN/Daily Mail dataset, first on an extractive and then on an abstractive objective.
## Setup
```bash
git clone https://github.com/huggingface/transformers && cd transformers
pip install [--editable] .
pip install nltk py-rouge
cd examples/summarization
```
## Reproduce the authors' results on ROUGE
To reproduce the authors' results on the CNN/Daily Mail dataset, first download both the CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") into the same folder. Then uncompress the archives by running:
```bash
tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
```
Then move all the stories into a single folder; we will refer to the path of that folder as `$DATA_PATH` (see the sketch below).
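One possible way to do this step, assuming the archives unpack into `cnn/stories` and `dailymail/stories` (adjust the paths to your layout):

```bash
# Gather all CNN and Daily Mail stories into a single folder
mkdir -p $DATA_PATH
find cnn/stories dailymail/stories -name '*.story' -exec mv {} $DATA_PATH/ \;
```

Then run the following in the same folder as `run_summarization.py` (the `--summaries_output_dir` argument is optional):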
```bash
python run_summarization.py \
    --documents_dir $DATA_PATH \
    --summaries_output_dir $SUMMARIES_PATH \
    --to_cpu false \
    --batch_size 4 \
    --min_length 50 \
    --max_length 200 \
    --beam_size 5 \
    --alpha 0.95 \
    --block_trigram true \
    --compute_rouge true
```
The script executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written to a `rouge_scores.txt` file. The script takes about 30 hours on a single Tesla V100 GPU with a batch size of 10 (300,000 texts to summarize).
## Summarize any text
Put the documents that you would like to summarize in a folder (we refer to its path as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py` (again, `--summaries_output_dir` is optional):
```bash
python run_summarization.py \
    --documents_dir $DATA_PATH \
    --summaries_output_dir $SUMMARIES_PATH \
    --to_cpu false \
    --batch_size 4 \
    --min_length 50 \
    --max_length 200 \
    --beam_size 5 \
    --alpha 0.95 \
    --block_trigram true
```
You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset, you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
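For instance, a purely illustrative variant for shorter summaries might look like the sketch below (the parameter values are hypothetical, not tuned recommendations):

```bash
# Illustrative only: shorter summaries with a lower length penalty
python run_summarization.py \
    --documents_dir $DATA_PATH \
    --to_cpu false \
    --batch_size 4 \
    --min_length 20 \
    --max_length 100 \
    --beam_size 5 \
    --alpha 0.7 \
    --block_trigram true
```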