added sentence ranking task and loss (#809)

Summary: This task and loss are used for sentence ranking and multiple choice tasks such as RACE Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/809 Reviewed By: myleott Differential Revision: D16715745 Pulled By: jingfeidu fbshipit-source-id: cb4d1c7b26ebb3e2382449ba51af5745ef56f30f

added sentence ranking task and loss (#809)
Summary: This task and loss are used for sentence ranking and multiple choice tasks such as RACE Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/809 Reviewed By: myleott Differential Revision: D16715745 Pulled By: jingfeidu fbshipit-source-id: cb4d1c7b26ebb3e2382449ba51af5745ef56f30f
b6c55b62 · Jingfei Du · Facebook Github Bot · 838e108a · b6c55b62 · b6c55b62
Commit b6c55b62 authored Aug 09, 2019 by Jingfei Du Committed by Facebook Github Bot Aug 09, 2019
5 changed files
--- a/examples/roberta/README.finetune_race.md
+++ b/examples/roberta/README.finetune_race.md
+# Finetuning RoBERTa on RACE tasks
+
+### 1) Download the data from RACE website (http://www.cs.cmu.edu/~glai1/data/race/)
+
+### 2) Preprocess RACE data:
+```bash
+python ./examples/roberta/preprocess_RACE.py <input-dir> <extracted-data-dir>
+./examples/roberta/preprocess_RACE.sh <extracted-data-dir> <output-dir>
+```
+
+### 3) Fine-tuning on RACE:
+
+```bash
+MAX_EPOCHS=5          # epoch number
+LR=1e-05              # Peak LR for fixed LR scheduler.
+NUM_CLASSES=4
+MAX_SENTENCES=2       # batch size
+ROBERTA_PATH=/path/to/roberta/model.pt
+
+CUDA_VISIBLE_DEVICES=0 python train.py <race-preprocessed-dir>/ \
+    --restore-file $ROBERTA_PATH \
+    --max-positions 512 \
+    --max-sentences $MAX_SENTENCES \
+    --task sentence_ranking \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --required-batch-size-multiple 1 \
+    --init-token 0 --separator-token 2 \
+    --arch roberta_large \
+    --criterion sentence_ranking \
+    --num-classes $NUM_CLASSES \
+    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
+    --clip-norm 0.0 \
+    --lr-scheduler fixed --lr $LR \
+    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
+    --max-epoch 10 \
+    --update-freq 8 \
+    --find-unused-parameters \
+    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
+```
+
+**Note:**
+
+a) As contexts in RACE are relatively long, we are using smaller batch size per GPU while increasing update-freq to achieve larger effective batch size.
+
+b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--max-sentences`.
+
+c) The setting in above command is based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search.  
--- a/examples/roberta/preprocess_RACE.py
+++ b/examples/roberta/preprocess_RACE.py
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import json
+import os
+
+
+class InputExample:
+    def __init__(self, paragraph, qa_list, label):
+        self.paragraph = paragraph
+        self.qa_list = qa_list
+        self.label = label
+
+
+def get_examples(data_dir, set_type):
+    """
+    Extract paragraph and question-answer list from each json file
+    """
+    examples = []
+
+    levels = ["middle", "high"]
+    set_type_c = set_type.split('-')
+    if len(set_type_c) == 2:
+        levels = [set_type_c[1]]
+        set_type = set_type_c[0]
+    for level in levels:
+        cur_dir = os.path.join(data_dir, set_type, level)
+        for filename in os.listdir(cur_dir):
+            cur_path = os.path.join(cur_dir, filename)
+            with open(cur_path, 'r') as f:
+                cur_data = json.load(f)
+                answers = cur_data["answers"]
+                options = cur_data["options"]
+                questions = cur_data["questions"]
+                context = cur_data["article"].replace("\n", " ")
+                for i in range(len(answers)):
+                    label = ord(answers[i]) - ord("A")
+                    qa_list = []
+                    question = questions[i]
+                    for j in range(4):
+                        option = options[i][j]
+                        if "_" in question:
+                            qa_cat = question.replace("_", option)
+                        else:
+                            qa_cat = " ".join([question, option])
+                        qa_list.append(qa_cat)
+                    examples.append(InputExample(context, qa_list, label))
+
+    return examples
+
+
+def main():
+    """
+    Helper script to extract paragraphs questions and answers from RACE datasets.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input-dir",
+        help='input directory for downloaded RACE dataset',
+    )
+    parser.add_argument(
+        "--output-dir",
+        help='output directory for extracted data',
+    )
+    args = parser.parse_args()
+
+    for set_type in ["train", "dev", "test-middle", "test-high"]:
+        examples = get_examples(args.input_dir, set_type)
+        qa_file_paths = [args.output_dir + set_type + ".input" + str(i + 1) for i in range(4)]
+        qa_files = [open(qa_file_path, 'w') for qa_file_path in qa_file_paths]
+        outf_context_path = args.output_dir + set_type + ".input0"
+        outf_label_path = args.output_dir + set_type + ".label"
+        outf_context = open(outf_context_path, 'w')
+        outf_label = open(outf_label_path, 'w')
+        for example in examples:
+            outf_context.write(example.paragraph + '\n')
+            for i in range(4):
+                qa_files[i].write(example.qa_list[i] + '\n')
+            outf_label.write(str(example.label) + '\n')
+        
+        for f in qa_files:
+            f.close()
+        outf_label.close()
+        outf_context.close()
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/roberta/preprocess_RACE.sh
+++ b/examples/roberta/preprocess_RACE.sh
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# data should be downloaded and processed with reprocess_RACE.py
+if [[ $# -ne 2 ]]; then
+  echo "Run as following:"
+  echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>"
+  exit 1
+fi
+
+RACE_DATA_FOLDER=$1
+OUT_DATA_FOLDER=$2
+
+# download bpe encoder.json, vocabulary and fairseq dictionary
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+SPLITS="train dev test-middle test-high"
+INPUT_TYPES="input0 input1 input2 input3 input4"
+for INPUT_TYPE in $INPUT_TYPES
+do
+  for SPLIT in $SPLITS
+      do
+      echo "BPE encoding $SPLIT/$INPUT_TYPE"
+      python -m examples.roberta.multiprocessing_bpe_encoder \
+            --encoder-json encoder.json \
+            --vocab-bpe vocab.bpe \
+            --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \
+            --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \
+            --workers 10 \
+            --keep-empty;
+
+      done
+done
+
+for INPUT_TYPE in $INPUT_TYPES
+    do
+      LANG="input$INPUT_TYPE"
+      fairseq-preprocess \
+        --dataset-impl cached \
+        --only-source \
+        --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
+        --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
+        --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
+        --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
+        --workers 10 \
+        --srcdict dict.txt;
+done
+
+rm -rf "$OUT_DATA_FOLDER/label"
+mkdir -p "$OUT_DATA_FOLDER/label"
+cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/"
+cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label"
+cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label"
+cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label"
\ No newline at end of file
--- a/fairseq/criterions/sentence_ranking.py
+++ b/fairseq/criterions/sentence_ranking.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+
+from fairseq import utils
+
+from . import FairseqCriterion, register_criterion
+
+
+@register_criterion('sentence_ranking')
+class SentenceRankingCriterion(FairseqCriterion):
+
+    def forward(self, model, sample, reduce=True):
+        """Compute ranking loss for the given sample.
+
+        Returns a tuple with three elements:
+        1) the loss
+        2) the sample size, which is used as the denominator for the gradient
+        3) logging outputs to display while training
+        """
+        scores = []
+        for idx in range(self.args.num_classes):
+            score, _ = model(
+                **sample['net_input{idx}'.format(idx=idx+1)],
+                features_only=True,
+                classification_head_name='sentence_classification_head',
+            )
+            scores.append(score)
+
+        logits = torch.cat(scores, dim=1)
+        targets = model.get_targets(sample, [logits]).view(-1)
+        sample_size = targets.numel()
+
+        loss = F.nll_loss(
+            F.log_softmax(logits, dim=-1, dtype=torch.float32),
+            targets,
+            reduction='sum',
+        )
+
+        logging_output = {
+            'loss': utils.item(loss.data) if reduce else loss.data,
+            'ntokens': sample['ntokens'],
+            'nsentences': sample_size,
+            'sample_size': sample_size,
+        }
+        logging_output.update(
+            ncorrect=(logits.max(dim=1)[1] == targets).sum().item()
+        )
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Aggregate logging outputs from data parallel training."""
+        loss_sum = sum(log.get('loss', 0) for log in logging_outputs)
+        ntokens = sum(log.get('ntokens', 0) for log in logging_outputs)
+        nsentences = sum(log.get('nsentences', 0) for log in logging_outputs)
+        sample_size = sum(log.get('sample_size', 0) for log in logging_outputs)
+
+        agg_output = {
+            'loss': loss_sum / sample_size / math.log(2),
+            'ntokens': ntokens,
+            'nsentences': nsentences,
+            'sample_size': sample_size,
+        }
+
+        if len(logging_outputs) > 0 and 'ncorrect' in logging_outputs[0]:
+            ncorrect = sum(log.get('ncorrect', 0) for log in logging_outputs)
+            agg_output.update(accuracy=ncorrect/nsentences)
+
+        if sample_size != ntokens:
+            agg_output['nll_loss'] = loss_sum / ntokens / math.log(2)
+        return agg_output
--- a/fairseq/tasks/sentence_ranking.py
+++ b/fairseq/tasks/sentence_ranking.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+import numpy as np
+
+from fairseq.data import (
+    ConcatSentencesDataset,
+    data_utils,
+    Dictionary,
+    IdDataset,
+    NestedDictionaryDataset,
+    NumSamplesDataset,
+    NumelDataset,
+    OffsetTokensDataset,
+    PrependTokenDataset,
+    RawLabelDataset,
+    RightPadDataset,
+    SortDataset,
+    TruncateDataset,
+)
+
+from . import FairseqTask, register_task
+
+
+@register_task('sentence_ranking')
+class SentenceRankingTask(FairseqTask):
+    """
+    Ranking task on multiple sentences.
+
+    Args:
+        dictionary (Dictionary): the dictionary for the input of the task
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument('data', metavar='FILE',
+                            help='file prefix for data')
+        parser.add_argument('--num-classes', type=int, default=2,
+                            help='number of sentences to be ranked')
+        parser.add_argument('--init-token', type=int, default=None,
+                            help='add token at the beginning of each batch item')
+        parser.add_argument('--separator-token', type=int, default=None,
+                            help='add separator token between inputs')
+        parser.add_argument('--no-shuffle', action='store_true', default=False)
+        parser.add_argument('--truncate-sequence', action='store_true', default=False,
+                            help='Truncate sequence to max_sequence_length')
+        parser.add_argument('--max-option-length', type=int, default=None,
+                            help='max length for each option')
+
+    def __init__(self, args, dictionary):
+        super().__init__(args)
+        self.dictionary = dictionary
+
+    @classmethod
+    def load_dictionary(cls, args, filename, source=True):
+        """Load the dictionary from the filename
+
+        Args:
+            filename (str): the filename
+        """
+        dictionary = Dictionary.load(filename)
+        dictionary.add_symbol('<mask>')
+        return dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        assert args.criterion == 'sentence_ranking', \
+            'Must set --criterion=sentence_ranking'
+
+        args.tokens_per_sample = args.max_positions
+
+        # load data dictionary
+        data_dict = cls.load_dictionary(
+            args,
+            os.path.join(args.data, 'input0', 'dict.txt'),
+            source=True,
+        )
+        print('| [input] dictionary: {} types'.format(len(data_dict)))
+        return SentenceRankingTask(args, data_dict)
+
+    def load_dataset(self, split, combine=False, **kwargs):
+        """Load a given dataset split (e.g., train, valid, test)."""
+
+        def get_path(type, split):
+            return os.path.join(self.args.data, type, split)
+
+        def make_dataset(type, dictionary):
+            split_path = get_path(type, split)
+
+            dataset = data_utils.load_indexed_dataset(
+                split_path,
+                self.source_dictionary,
+                self.args.dataset_impl,
+                combine=combine,
+            )
+            return dataset
+
+        input0 = make_dataset('input0', self.source_dictionary)
+        input_options = [
+            make_dataset(
+                'input{idx}'.format(idx=idx + 1),
+                self.source_dictionary
+            )
+            for idx in range(self.args.num_classes)
+        ]
+
+        if self.args.separator_token is not None:
+            input0 = PrependTokenDataset(input0, self.args.separator_token)
+
+        src_tokens = []
+        for input_option in input_options:
+            if self.args.init_token is not None:
+                input_option = PrependTokenDataset(input_option, self.args.init_token)
+            input_option = TruncateDataset(input_option, self.args.max_option_length)
+            src_token = ConcatSentencesDataset(input_option, input0)
+            if self.args.truncate_sequence:
+                src_token = TruncateDataset(src_token, self.args.max_positions)
+            src_tokens.append(src_token)
+
+        with data_utils.numpy_seed(self.args.seed):
+            shuffle = np.random.permutation(len(src_tokens[0]))
+
+        dataset = {
+            'id': IdDataset(),
+            'nsentences': NumSamplesDataset(),
+            'ntokens': NumelDataset(src_tokens[0], reduce=True),
+        }
+
+        for src_token_idx in range(len(src_tokens)):
+            dataset.update(
+                {
+                    'net_input{idx}'.format(idx=src_token_idx+1): {
+                        'src_tokens': RightPadDataset(
+                            src_tokens[src_token_idx],
+                            pad_idx=self.source_dictionary.pad(),
+                        ),
+                        'src_lengths': NumelDataset(src_tokens[src_token_idx], reduce=False),
+                    }
+                }
+            )
+
+        label_path = f"{get_path('label', split)}.label"
+        if os.path.exists(label_path):
+            dataset.update(
+                target=RawLabelDataset([
+                    int(x.strip()) for x in open(label_path).readlines()
+                ])
+            )
+
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
+        )
+
+        if self.args.no_shuffle:
+            dataset = nested_dataset
+        else:
+            dataset = SortDataset(
+                nested_dataset,
+                # shuffle
+                sort_order=[shuffle],
+            )
+
+        print("| Loaded {0} with #samples: {1}".format(split, len(dataset)))
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
+
+    def build_model(self, args):
+        from fairseq import models
+        model = models.build_model(args, self)
+
+        model.register_classification_head(
+            'sentence_classification_head',
+            num_classes=1,
+        )
+
+        return model
+
+    def max_positions(self):
+        return self.args.max_positions
+
+    @property
+    def source_dictionary(self):
+        return self.dictionary
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary