v1.0

799a38c5 · chenzk · 799a38c5 · 799a38c5 · 799a38c5 · 799a38c5
Commit 799a38c5 authored Nov 10, 2023 by chenzk
20 changed files
--- a/fairseq/examples/roberta/config/finetuning/mnli.yaml
+++ b/fairseq/examples/roberta/config/finetuning/mnli.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 3
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  best_checkpoint_metric: accuracy
+  maximize_best_checkpoint_metric: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+
+dataset:
+  batch_size: 32
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 7432
+
+optimization:
+  clip_norm: 0.0
+  lr: [1e-05]
+  max_update: 123873
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/finetuning/mrpc.yaml
+++ b/fairseq/examples/roberta/config/finetuning/mrpc.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 2
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  best_checkpoint_metric: accuracy
+  maximize_best_checkpoint_metric: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+
+dataset:
+  batch_size: 16
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 137
+
+optimization:
+  clip_norm: 0.0
+  lr: [1e-05]
+  max_update: 2296
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/finetuning/qnli.yaml
+++ b/fairseq/examples/roberta/config/finetuning/qnli.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 2
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  best_checkpoint_metric: accuracy
+  maximize_best_checkpoint_metric: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+
+dataset:
+  batch_size: 32
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 1986
+
+optimization:
+  clip_norm: 0.0
+  lr: [1e-05]
+  max_update: 33112
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/finetuning/qqp.yaml
+++ b/fairseq/examples/roberta/config/finetuning/qqp.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 2
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  best_checkpoint_metric: accuracy
+  maximize_best_checkpoint_metric: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+
+dataset:
+  batch_size: 32
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 28318
+
+optimization:
+  clip_norm: 0.0
+  lr: [1e-05]
+  max_update: 113272
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/finetuning/rte.yaml
+++ b/fairseq/examples/roberta/config/finetuning/rte.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 2
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  best_checkpoint_metric: accuracy
+  maximize_best_checkpoint_metric: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+
+dataset:
+  batch_size: 16
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 122
+
+optimization:
+  clip_norm: 0.0
+  lr: [2e-05]
+  max_update: 2036
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/finetuning/sst_2.yaml
+++ b/fairseq/examples/roberta/config/finetuning/sst_2.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 2
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  best_checkpoint_metric: accuracy
+  maximize_best_checkpoint_metric: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+
+dataset:
+  batch_size: 32
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 1256
+
+optimization:
+  clip_norm: 0.0
+  lr: [1e-05]
+  max_update: 20935
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/finetuning/sts_b.yaml
+++ b/fairseq/examples/roberta/config/finetuning/sts_b.yaml
+# @package _group_
+
+common:
+  fp16: true
+  fp16_init_scale: 4
+  threshold_loss_scale: 1
+  fp16_scale_window: 128
+  log_format: json
+  log_interval: 200
+
+task:
+  _name: sentence_prediction
+  data: ???
+  init_token: 0
+  separator_token: 2
+  num_classes: 1
+  max_positions: 512
+
+checkpoint:
+  restore_file: ???
+  reset_optimizer: true
+  reset_dataloader: true
+  reset_meters: true
+  no_epoch_checkpoints: true
+
+distributed_training:
+  find_unused_parameters: true
+  distributed_world_size: 1
+
+criterion:
+  _name: sentence_prediction
+  regression_target: true
+
+dataset:
+  batch_size: 16
+  required_batch_size_multiple: 1
+  max_tokens: 4400
+
+optimizer:
+  _name: adam
+  weight_decay: 0.1
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 214
+
+optimization:
+  clip_norm: 0.0
+  lr: [2e-05]
+  max_update: 3598
+  max_epoch: 10
+
+model:
+  _name: roberta
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/config/pretraining/base.yaml
+++ b/fairseq/examples/roberta/config/pretraining/base.yaml
+# @package _group_
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+
+checkpoint:
+  no_epoch_checkpoints: true
+
+task:
+  _name: masked_lm
+  data: ???
+  sample_break_mode: complete
+  tokens_per_sample: 512
+
+criterion: masked_lm
+
+dataset:
+  batch_size: 16
+  ignore_unused_valid_subsets: true
+
+optimizer:
+  _name: adam
+  weight_decay: 0.01
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 10000
+
+optimization:
+  clip_norm: 0
+  lr: [0.0005]
+  max_update: 125000
+  update_freq: [16]
+
+model:
+  _name: roberta
+  max_positions: 512
+  dropout: 0.1
+  attention_dropout: 0.1
--- a/fairseq/examples/roberta/multiprocessing_bpe_encoder.py
+++ b/fairseq/examples/roberta/multiprocessing_bpe_encoder.py
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import contextlib
+import sys
+from collections import Counter
+from multiprocessing import Pool
+
+from fairseq.data.encoders.gpt2_bpe import get_encoder
+
+
+def main():
+    """
+    Helper script to encode raw text with the GPT-2 BPE using multiple processes.
+
+    Reads one or more parallel input files line by line (``-`` means stdin),
+    BPE-encodes each group of lines in a worker pool and writes the encoded
+    lines to the matching output files (``-`` means stdout). Progress and
+    filter statistics are printed to stderr.
+
+    The encoder.json and vocab.bpe files can be obtained here:
+    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
+    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--encoder-json",
+        help="path to encoder.json",
+    )
+    parser.add_argument(
+        "--vocab-bpe",
+        type=str,
+        help="path to vocab.bpe",
+    )
+    parser.add_argument(
+        "--inputs",
+        nargs="+",
+        default=["-"],
+        help="input files to filter/encode",
+    )
+    parser.add_argument(
+        "--outputs",
+        nargs="+",
+        default=["-"],
+        help="path to save encoded outputs",
+    )
+    parser.add_argument(
+        "--keep-empty",
+        action="store_true",
+        help="keep empty lines",
+    )
+    parser.add_argument("--workers", type=int, default=20)
+    args = parser.parse_args()
+
+    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
+    if len(args.inputs) != len(args.outputs):
+        parser.error("number of input and output paths should match")
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            sys.stdin
+            if path == "-"
+            else stack.enter_context(open(path, "r", encoding="utf-8"))
+            for path in args.inputs
+        ]
+        outputs = [
+            sys.stdout
+            if path == "-"
+            else stack.enter_context(open(path, "w", encoding="utf-8"))
+            for path in args.outputs
+        ]
+
+        encoder = MultiprocessingEncoder(args)
+        # Register the pool on the exit stack so the worker processes are shut
+        # down when the block is left, including when encoding or writing
+        # raises. The stack unwinds in reverse order: pool first, then files.
+        pool = stack.enter_context(
+            Pool(args.workers, initializer=encoder.initializer)
+        )
+        # zip(*inputs) yields one tuple per line position across all inputs;
+        # 100 is the chunk size handed to each worker.
+        encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)
+
+        stats = Counter()
+        for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
+            if filt == "PASS":
+                for enc_line, output_h in zip(enc_lines, outputs):
+                    print(enc_line, file=output_h)
+            else:
+                # Dropped line group; count it under the reason reported.
+                stats["num_filtered_" + filt] += 1
+            if i % 10000 == 0:
+                print("processed {} lines".format(i), file=sys.stderr)
+
+        for k, v in stats.most_common():
+            print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
+
+
+class MultiprocessingEncoder(object):
+    """Encode/decode helper whose methods run inside the worker processes.
+
+    The BPE object is kept in the module-level global ``bpe``, which
+    ``initializer`` fills in; the instance itself only carries ``args``.
+    """
+
+    def __init__(self, args):
+        self.args = args
+
+    def initializer(self):
+        """Create the BPE encoder from ``args`` and store it in the global."""
+        global bpe
+        bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)
+
+    def encode(self, line):
+        """Return the BPE ids of ``line`` as a list of strings."""
+        global bpe
+        return [str(token_id) for token_id in bpe.encode(line)]
+
+    def decode(self, tokens):
+        """Turn an iterable of BPE ids back into text."""
+        global bpe
+        return bpe.decode(tokens)
+
+    def encode_lines(self, lines):
+        """
+        Encode a set of lines. All lines will be encoded together.
+
+        Returns ``["PASS", encoded_lines]``, or ``["EMPTY", None]`` as soon as
+        a line is empty after stripping and ``--keep-empty`` is not set.
+        """
+        encoded = []
+        for raw in lines:
+            stripped = raw.strip()
+            if not stripped and not self.args.keep_empty:
+                return ["EMPTY", None]
+            encoded.append(" ".join(self.encode(stripped)))
+        return ["PASS", encoded]
+
+    def decode_lines(self, lines):
+        """Decode lines of space-separated BPE ids; returns ``["PASS", texts]``."""
+        decoded = [self.decode(map(int, raw.strip().split())) for raw in lines]
+        return ["PASS", decoded]
+
+
+if __name__ == "__main__":
+    main()
--- a/fairseq/examples/roberta/preprocess_GLUE_tasks.sh
+++ b/fairseq/examples/roberta/preprocess_GLUE_tasks.sh
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# raw glue data as downloaded by glue download script (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+# Exactly two arguments are required: the GLUE data folder and the task name
+# (a single task such as QQP, or ALL for every task).
+if [[ $# -ne 2 ]]; then
+  echo "Run as following:"
+  echo "./examples/roberta/preprocess_GLUE_tasks.sh <glue_data_folder> <task_name>"
+  exit 1
+fi
+
+# Positional arguments: root of the raw GLUE data, and the task selection.
+GLUE_DATA_FOLDER=$1
+TASKS=$2 # QQP
+
+# download bpe encoder.json, vocabulary and fairseq dictionary
+# (wget -N skips the download when the local copy is already up to date)
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+# "ALL" expands to the full list of supported tasks.
+if [[ "$TASKS" == "ALL" ]]; then
+  TASKS="QQP MNLI QNLI MRPC RTE STS-B SST-2 CoLA"
+fi
+
+for TASK in $TASKS
+do
+  echo "Preprocessing $TASK"
+
+  TASK_DATA_FOLDER="$GLUE_DATA_FOLDER/$TASK"
+  echo "Raw data as downloaded from glue website: $TASK_DATA_FOLDER"
+
+  # Per-task layout of the raw TSV files: which splits exist, which (1-based)
+  # columns hold the input sentence(s) in train/dev files vs. test files, and
+  # which column holds the label.
+  SPLITS="train dev test"
+  INPUT_COUNT=2
+  case "$TASK" in
+    QQP)
+      INPUT_COLUMNS=( 4 5 )
+      TEST_INPUT_COLUMNS=( 2 3 )
+      LABEL_COLUMN=6
+      ;;
+    MNLI)
+      SPLITS="train dev_matched dev_mismatched test_matched test_mismatched"
+      INPUT_COLUMNS=( 9 10 )
+      TEST_INPUT_COLUMNS=( 9 10 )
+      DEV_LABEL_COLUMN=16
+      LABEL_COLUMN=12
+      ;;
+    QNLI)
+      INPUT_COLUMNS=( 2 3 )
+      TEST_INPUT_COLUMNS=( 2 3 )
+      LABEL_COLUMN=4
+      ;;
+    MRPC)
+      INPUT_COLUMNS=( 4 5 )
+      TEST_INPUT_COLUMNS=( 4 5 )
+      LABEL_COLUMN=1
+      ;;
+    RTE)
+      INPUT_COLUMNS=( 2 3 )
+      TEST_INPUT_COLUMNS=( 2 3 )
+      LABEL_COLUMN=4
+      ;;
+    STS-B)
+      INPUT_COLUMNS=( 8 9 )
+      TEST_INPUT_COLUMNS=( 8 9 )
+      LABEL_COLUMN=10
+      ;;
+    # Following are single sentence tasks.
+    SST-2)
+      INPUT_COLUMNS=( 1 )
+      TEST_INPUT_COLUMNS=( 2 )
+      LABEL_COLUMN=2
+      INPUT_COUNT=1
+      ;;
+    CoLA)
+      INPUT_COLUMNS=( 4 )
+      TEST_INPUT_COLUMNS=( 2 )
+      LABEL_COLUMN=2
+      INPUT_COUNT=1
+      ;;
+    *)
+      # Without this guard an unknown task would silently reuse the column
+      # settings left over from the previous task (or none at all).
+      echo "Unknown GLUE task: $TASK" >&2
+      exit 1
+      ;;
+  esac
+
+  # Strip out header and filter lines that don't have expected number of fields.
+  rm -rf "$TASK_DATA_FOLDER/processed"
+  mkdir -p "$TASK_DATA_FOLDER/processed"
+  for SPLIT in $SPLITS
+  do
+    # CoLA train and dev don't have a header.
+    if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]]
+    then
+      cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
+    else
+      tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
+    fi
+
+    # Remove unformatted lines from train and dev files for QQP dataset.
+    if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]]
+    then
+      awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv";
+    else
+      cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv";
+    fi
+    rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
+  done
+
+  # Split into input0, input1 and label
+  for SPLIT in $SPLITS
+  do
+    for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
+    do
+      # Test files use a different column layout than train/dev files.
+      if [[ "$SPLIT" != test* ]]
+      then
+        COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]}
+      else
+        COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]}
+      fi
+      cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE";
+    done
+
+    # Labels are only extracted for non-test splits.
+    if [[ "$SPLIT" != test* ]]
+    then
+      if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ]
+      then
+        cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"  > "$TASK_DATA_FOLDER/processed/$SPLIT.label";
+      else
+        cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label";
+      fi
+    fi
+
+    # BPE encode.
+    for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
+    do
+      # Named INPUT_NAME rather than LANG: LANG is the locale variable, and
+      # overwriting it would change the locale of the child processes.
+      INPUT_NAME="input$INPUT_TYPE"
+      echo "BPE encoding $SPLIT/$INPUT_NAME"
+      python -m examples.roberta.multiprocessing_bpe_encoder \
+      --encoder-json encoder.json \
+      --vocab-bpe vocab.bpe \
+      --inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$INPUT_NAME" \
+      --outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$INPUT_NAME" \
+      --workers 60 \
+      --keep-empty;
+    done
+  done
+
+  # Remove output directory.
+  rm -rf "$TASK-bin"
+
+  # "LANG" in these templates is a literal placeholder that is substituted
+  # with the input name (or "label") below.
+  DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG"
+  TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG"
+  if [ "$TASK" = "MNLI" ]
+  then
+    DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG"
+    TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG"
+  fi
+
+  # Run fairseq preprocessing:
+  for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
+  do
+    INPUT_NAME="input$INPUT_TYPE"
+    fairseq-preprocess \
+      --only-source \
+      --trainpref "$TASK_DATA_FOLDER/processed/train.$INPUT_NAME" \
+      --validpref "${DEVPREF//LANG/$INPUT_NAME}" \
+      --testpref "${TESTPREF//LANG/$INPUT_NAME}" \
+      --destdir "$TASK-bin/$INPUT_NAME" \
+      --workers 60 \
+      --srcdict dict.txt;
+  done
+  if [[ "$TASK" !=  "STS-B" ]]
+  then
+    fairseq-preprocess \
+      --only-source \
+      --trainpref "$TASK_DATA_FOLDER/processed/train.label" \
+      --validpref "${DEVPREF//LANG/label}" \
+      --destdir "$TASK-bin/label" \
+      --workers 60;
+  else
+    # For STS-B output range is converted to be between: [0.0, 1.0]
+    mkdir -p "$TASK-bin/label"
+    awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label"
+    awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label"
+  fi
+done
--- a/fairseq/examples/roberta/preprocess_RACE.py
+++ b/fairseq/examples/roberta/preprocess_RACE.py
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import json
+import os
+import re
+
+
+class InputExample:
+    """Plain record for one question: passage, candidate QA strings and label."""
+
+    def __init__(self, paragraph, qa_list, label):
+        self.paragraph, self.qa_list, self.label = paragraph, qa_list, label
+
+
+def get_examples(data_dir, set_type):
+    """
+    Extract paragraph and question-answer list from each json file
+
+    ``set_type`` is either a split name (e.g. ``"train"``), in which case both
+    the ``middle`` and ``high`` sub-directories are read, or
+    ``"<split>-<level>"`` (e.g. ``"test-high"``) to read a single level.
+
+    Returns a list of ``InputExample``, one per question. Each holds the
+    whitespace-normalized article, four question+option strings and the
+    0-based index of the correct option (``"A"`` -> 0).
+    """
+    levels = ["middle", "high"]
+    set_type_c = set_type.split("-")
+    if len(set_type_c) == 2:
+        set_type, only_level = set_type_c
+        levels = [only_level]
+
+    examples = []
+    for level in levels:
+        cur_dir = os.path.join(data_dir, set_type, level)
+        # os.listdir order is arbitrary; sort so the output is reproducible.
+        for filename in sorted(os.listdir(cur_dir)):
+            cur_path = os.path.join(cur_dir, filename)
+            # Read as UTF-8 explicitly rather than with the platform default.
+            with open(cur_path, "r", encoding="utf-8") as f:
+                cur_data = json.load(f)
+            answers = cur_data["answers"]
+            options = cur_data["options"]
+            questions = cur_data["questions"]
+            # Collapse every whitespace run (including newlines) to one space.
+            context = re.sub(r"\s+", " ", cur_data["article"])
+            for i, answer in enumerate(answers):
+                label = ord(answer) - ord("A")
+                question = questions[i]
+                qa_list = []
+                for j in range(4):
+                    option = options[i][j]
+                    if "_" in question:
+                        # Cloze-style question: fill the blank with the option.
+                        qa_cat = question.replace("_", option)
+                    else:
+                        qa_cat = " ".join([question, option])
+                    qa_list.append(re.sub(r"\s+", " ", qa_cat))
+                examples.append(InputExample(context, qa_list, label))
+
+    return examples
+
+
+def main():
+    """
+    Helper script to extract paragraphs questions and answers from RACE datasets.
+
+    For each of the train, dev, test-middle and test-high sets this writes,
+    into the output directory, ``<set>.input0`` (paragraphs),
+    ``<set>.input1`` .. ``<set>.input4`` (question combined with each of the
+    four options) and ``<set>.label``, one example per line.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input-dir",
+        required=True,
+        help="input directory for downloaded RACE dataset",
+    )
+    parser.add_argument(
+        "--output-dir",
+        required=True,
+        help="output directory for extracted data",
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    def write_lines(path, lines):
+        # ``with`` closes the file even if writing fails. UTF-8 matches what
+        # multiprocessing_bpe_encoder.py uses to read these files.
+        with open(path, "w", encoding="utf-8") as f:
+            f.writelines(line + "\n" for line in lines)
+
+    for set_type in ["train", "dev", "test-middle", "test-high"]:
+        examples = get_examples(args.input_dir, set_type)
+        prefix = os.path.join(args.output_dir, set_type)
+        write_lines(prefix + ".input0", (ex.paragraph for ex in examples))
+        for i in range(4):
+            write_lines(
+                prefix + ".input" + str(i + 1),
+                (ex.qa_list[i] for ex in examples),
+            )
+        write_lines(prefix + ".label", (str(ex.label) for ex in examples))
+
+
+if __name__ == "__main__":
+    main()
--- a/fairseq/examples/roberta/preprocess_RACE.sh
+++ b/fairseq/examples/roberta/preprocess_RACE.sh
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+# data should be downloaded and processed with preprocess_RACE.py
+if [[ $# -ne 2 ]]; then
+  echo "Run as following:"
+  echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>"
+  exit 1
+fi
+
+RACE_DATA_FOLDER=$1
+OUT_DATA_FOLDER=$2
+
+# download bpe encoder.json, vocabulary and fairseq dictionary
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
+wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
+
+SPLITS="train dev test-middle test-high"
+INPUT_TYPES="input0 input1 input2 input3 input4"
+
+# BPE-encode every <split>.<input type> file into <split>.<input type>.bpe
+for INPUT_TYPE in $INPUT_TYPES
+do
+  for SPLIT in $SPLITS
+  do
+    echo "BPE encoding $SPLIT/$INPUT_TYPE"
+    python -m examples.roberta.multiprocessing_bpe_encoder \
+      --encoder-json encoder.json \
+      --vocab-bpe vocab.bpe \
+      --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \
+      --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \
+      --workers 10 \
+      --keep-empty;
+  done
+done
+
+# Binarize each input type into its own sub-directory of the output folder.
+# Only $INPUT_TYPE is needed here; LANG is deliberately left untouched because
+# it is the locale variable read by child processes.
+for INPUT_TYPE in $INPUT_TYPES
+do
+  fairseq-preprocess \
+    --only-source \
+    --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
+    --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
+    --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
+    --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
+    --workers 10 \
+    --srcdict dict.txt;
+done
+
+# Labels are copied as plain text: dev becomes valid, test-middle becomes
+# test and test-high becomes test1.
+rm -rf "$OUT_DATA_FOLDER/label"
+mkdir -p "$OUT_DATA_FOLDER/label"
+cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/"
+cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label"
+cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label"
+cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label"
--- a/fairseq/examples/roberta/wsc/README.md
+++ b/fairseq/examples/roberta/wsc/README.md
+# Finetuning RoBERTa on Winograd Schema Challenge (WSC) data
+
+The following instructions can be used to finetune RoBERTa on the WSC training
+data provided by [SuperGLUE](https://super.gluebenchmark.com/).
+
+Note that there is high variance in the results. For our GLUE/SuperGLUE
+submission we swept over the learning rate (1e-5, 2e-5, 3e-5), batch size (16,
+32, 64) and total number of updates (500, 1000, 2000, 3000), as well as the
+random seed. Out of ~100 runs we chose the best 7 models and ensembled them.
+
+**Approach:** The instructions below use a slightly different loss function than
+what's described in the original RoBERTa arXiv paper. In particular,
+[Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin
+ranking loss between `(query, candidate)` pairs with tunable hyperparameters
+alpha and beta. This is supported in our code as well with the
+`--wsc-margin-alpha` and `--wsc-margin-beta` arguments. However, we achieved
+slightly better (and more robust)
+results on the development set by instead using a single cross entropy loss term
+over the log-probabilities for the query and all mined candidates. **The
+candidates are mined using spaCy from each input sentence in isolation, so the
+approach remains strictly pointwise.** This reduces the number of
+hyperparameters and our best model achieved 92.3% development set accuracy,
+compared to ~90% accuracy for the margin loss. Later versions of the RoBERTa
+arXiv paper will describe this updated formulation.
+
+### 1) Download the WSC data from the SuperGLUE website:
+```bash
+wget https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip
+unzip WSC.zip
+
+# we also need to copy the RoBERTa dictionary into the same directory
+wget -O WSC/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
+```
+
+### 2) Finetune over the provided training data:
+```bash
+TOTAL_NUM_UPDATES=2000  # Total number of training steps.
+WARMUP_UPDATES=250      # Linearly increase LR over this many steps.
+LR=2e-05                # Peak LR for polynomial LR scheduler.
+MAX_SENTENCES=16        # Batch size per GPU.
+SEED=1                  # Random seed.
+ROBERTA_PATH=/path/to/roberta/model.pt
+
+# we use the --user-dir option to load the task and criterion
+# from the examples/roberta/wsc directory:
+FAIRSEQ_PATH=/path/to/fairseq
+FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \
+    --restore-file $ROBERTA_PATH \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
+    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
+    --valid-subset val \
+    --fp16 --ddp-backend legacy_ddp \
+    --user-dir $FAIRSEQ_USER_DIR \
+    --task wsc --criterion wsc --wsc-cross-entropy \
+    --arch roberta_large --bpe gpt2 --max-positions 512 \
+    --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
+    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
+    --lr-scheduler polynomial_decay --lr $LR \
+    --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \
+    --batch-size $MAX_SENTENCES \
+    --max-update $TOTAL_NUM_UPDATES \
+    --log-format simple --log-interval 100 \
+    --seed $SEED
+```
+
+The above command assumes training on 4 GPUs, but you can achieve the same
+results on a single GPU by adding `--update-freq=4`.
+
+### 3) Evaluate
+```python
+from fairseq.models.roberta import RobertaModel
+from examples.roberta.wsc import wsc_utils  # also loads WSC task and criterion
+roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'WSC/')
+roberta.cuda()
+nsamples, ncorrect = 0, 0
+for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True):
+    pred = roberta.disambiguate_pronoun(sentence)
+    nsamples += 1
+    if pred == label:
+        ncorrect += 1
+print('Accuracy: ' + str(ncorrect / float(nsamples)))
+# Accuracy: 0.9230769230769231
+```
+
+## RoBERTa training on WinoGrande dataset
+We also provide a `winogrande` task and criterion for finetuning on
+[WinoGrande](https://mosaic.allenai.org/projects/winogrande)-like datasets,
+where there are always two candidates and one of them is correct.
+This is a more efficient implementation for such cases.
+
+```bash
+TOTAL_NUM_UPDATES=23750 # Total number of training steps.
+WARMUP_UPDATES=2375     # Linearly increase LR over this many steps.
+LR=1e-05                # Peak LR for polynomial LR scheduler.
+MAX_SENTENCES=32        # Batch size per GPU.
+SEED=1                  # Random seed.
+ROBERTA_PATH=/path/to/roberta/model.pt
+
+# we use the --user-dir option to load the task and criterion
+# from the examples/roberta/wsc directory:
+FAIRSEQ_PATH=/path/to/fairseq
+FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc
+
+cd fairseq
+CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \
+  --restore-file $ROBERTA_PATH \
+  --reset-optimizer --reset-dataloader --reset-meters \
+  --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
+  --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
+  --valid-subset val \
+  --fp16 --ddp-backend legacy_ddp \
+  --user-dir $FAIRSEQ_USER_DIR \
+  --task winogrande --criterion winogrande \
+  --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \
+  --arch roberta_large --bpe gpt2 --max-positions 512 \
+  --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
+  --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
+  --lr-scheduler polynomial_decay --lr $LR \
+  --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \
+  --batch-size $MAX_SENTENCES \
+  --max-update $TOTAL_NUM_UPDATES \
+  --log-format simple --log-interval 100
+```
--- a/fairseq/examples/roberta/wsc/__init__.py
+++ b/fairseq/examples/roberta/wsc/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import wsc_criterion  # noqa
+from . import wsc_task  # noqa
--- a/fairseq/examples/roberta/wsc/wsc_criterion.py
+++ b/fairseq/examples/roberta/wsc/wsc_criterion.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.criterions import LegacyFairseqCriterion, register_criterion
+from fairseq.data import encoders
+
+
+@register_criterion("wsc")
+class WSCCriterion(LegacyFairseqCriterion):
+    """Ranking criterion for Winograd Schema Challenge fine-tuning.
+
+    Each example provides one query span and a set of candidate spans. Every
+    span is scored by the mean masked-LM log-probability of its tokens, and
+    the query is predicted to be correct when it scores at least as high as
+    all candidates.
+    """
+
+    def __init__(self, args, task):
+        super().__init__(args, task)
+        if self.args.save_predictions is not None:
+            self.prediction_h = open(self.args.save_predictions, "w")
+        else:
+            self.prediction_h = None
+        self.bpe = encoders.build_bpe(args.bpe)
+        self.tokenizer = encoders.build_tokenizer(args.tokenizer)
+
+    def __del__(self):
+        # __init__ may have raised before ``prediction_h`` was assigned, so do
+        # not assume the attribute exists.
+        handle = getattr(self, "prediction_h", None)
+        if handle is not None:
+            handle.close()
+
+    @staticmethod
+    def add_args(parser):
+        """Add criterion-specific arguments to the parser."""
+        parser.add_argument("--wsc-margin-alpha", type=float, metavar="A", default=1.0)
+        parser.add_argument("--wsc-margin-beta", type=float, metavar="B", default=0.0)
+        parser.add_argument(
+            "--wsc-cross-entropy",
+            action="store_true",
+            help="use cross entropy formulation instead of margin loss",
+        )
+        parser.add_argument(
+            "--save-predictions", metavar="FILE", help="file to save predictions to"
+        )
+
+    def get_masked_input(self, tokens, mask):
+        """Return a copy of ``tokens`` with the positions in ``mask`` replaced
+        by the task's mask symbol."""
+        masked_tokens = tokens.clone()
+        masked_tokens[mask] = self.task.mask
+        return masked_tokens
+
+    def get_lprobs(self, model, tokens, mask):
+        """Score each row of ``tokens`` by the mean log-probability that the
+        model assigns to the original tokens at the masked positions."""
+        logits, _ = model(src_tokens=self.get_masked_input(tokens, mask))
+        lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
+        # pick the log-probability of the original token at every position
+        scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
+        mask = mask.type_as(scores)
+        # average over the masked positions only
+        scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
+        return scores
+
+    def get_loss(self, query_lprobs, cand_lprobs):
+        """Compute the training loss from query and candidate scores.
+
+        With ``--wsc-cross-entropy`` the scores are concatenated and treated
+        as logits whose target is index 0 (the query). Otherwise a margin
+        loss is used: ``-query + alpha * max(0, cand - query + beta)``,
+        summed over all candidates.
+        """
+        if self.args.wsc_cross_entropy:
+            return F.cross_entropy(
+                torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0),
+                query_lprobs.new([0]).long(),
+            )
+        else:
+            return (
+                -query_lprobs
+                + self.args.wsc_margin_alpha
+                * (cand_lprobs - query_lprobs + self.args.wsc_margin_beta).clamp(min=0)
+            ).sum()
+
+    def forward(self, model, sample, reduce=True):
+        """Compute the loss and accuracy statistics for one batch.
+
+        Returns:
+            tuple: ``(loss, sample_size, logging_output)`` where
+            ``sample_size`` is the number of labelled examples (at least 1).
+        """
+        # compute loss and accuracy
+        loss, nloss = 0.0, 0
+        ncorrect, nqueries = 0, 0
+
+        for i, label in enumerate(sample["labels"]):
+            query_lprobs = self.get_lprobs(
+                model,
+                sample["query_tokens"][i].unsqueeze(0),
+                sample["query_masks"][i].unsqueeze(0),
+            )
+            cand_lprobs = self.get_lprobs(
+                model,
+                sample["candidate_tokens"][i],
+                sample["candidate_masks"][i],
+            )
+
+            # the query wins only if it beats (or ties) every candidate
+            pred = (query_lprobs >= cand_lprobs).all().item()
+
+            if label is not None:
+                label = 1 if label else 0
+                ncorrect += 1 if pred == label else 0
+                nqueries += 1
+
+            if label:
+                # only compute a loss for positive instances
+                nloss += 1
+                loss += self.get_loss(query_lprobs, cand_lprobs)
+
+            if self.prediction_h is not None:
+                # only read the id when it is actually written out; this
+                # avoids a per-example ``.item()`` call and shadowing ``id``
+                sample_id = sample["id"][i].item()
+                print(
+                    "{}\t{}\t{}".format(sample_id, pred, label),
+                    file=self.prediction_h,
+                )
+
+        if nloss == 0:
+            # no positive instance in this batch: return a zero loss that
+            # still supports ``backward()``
+            loss = torch.tensor(0.0, requires_grad=True)
+
+        sample_size = nqueries if nqueries > 0 else 1
+        logging_output = {
+            "loss": utils.item(loss.data) if reduce else loss.data,
+            "ntokens": sample["ntokens"],
+            "nsentences": sample["nsentences"],
+            "sample_size": sample_size,
+            "ncorrect": ncorrect,
+            "nqueries": nqueries,
+        }
+        return loss, sample_size, logging_output
+
+    @staticmethod
+    def aggregate_logging_outputs(logging_outputs):
+        """Aggregate logging outputs from data parallel training.
+
+        The reported loss is normalised by the total sample size and divided
+        by ``ln 2``. When no samples were logged the loss is reported as 0.0
+        instead of raising ``ZeroDivisionError``.
+        """
+        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+        if sample_size > 0:
+            avg_loss = loss_sum / sample_size / math.log(2)
+        else:
+            # empty (or all-zero) logging outputs: nothing to normalise by
+            avg_loss = 0.0
+
+        agg_output = {
+            "loss": avg_loss,
+            "ntokens": ntokens,
+            "nsentences": nsentences,
+            "sample_size": sample_size,
+        }
+
+        ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
+        nqueries = sum(log.get("nqueries", 0) for log in logging_outputs)
+        if nqueries > 0:
+            agg_output["accuracy"] = ncorrect / float(nqueries)
+
+        return agg_output
+
+
+@register_criterion("winogrande")
+class WinograndeCriterion(WSCCriterion):
+    """Batched variant of the WSC criterion with one candidate per query."""
+
+    def forward(self, model, sample, reduce=True):
+        """Score queries and candidates for the whole batch in one pass and
+        return ``(loss, sample_size, logging_output)``."""
+        q_tokens = sample["query_tokens"]
+        q_scores = self.get_lprobs(model, q_tokens, sample["query_masks"])
+        c_scores = self.get_lprobs(
+            model, sample["candidate_tokens"], sample["candidate_masks"]
+        )
+
+        # a query counts as correct when it scores at least as high as its
+        # paired candidate
+        hits = q_scores >= c_scores
+        loss = self.get_loss(q_scores, c_scores)
+
+        batch_size = q_tokens.size(0)
+        num_hits = hits.sum().item()
+        logged_loss = utils.item(loss.data) if reduce else loss.data
+        return (
+            loss,
+            batch_size,
+            {
+                "loss": logged_loss,
+                "ntokens": sample["ntokens"],
+                "nsentences": sample["nsentences"],
+                "sample_size": batch_size,
+                "ncorrect": num_hits,
+                "nqueries": batch_size,
+            },
+        )
--- a/fairseq/examples/roberta/wsc/wsc_task.py
+++ b/fairseq/examples/roberta/wsc/wsc_task.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import os
+import tempfile
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.data import (
+    Dictionary,
+    IdDataset,
+    ListDataset,
+    NestedDictionaryDataset,
+    NumelDataset,
+    NumSamplesDataset,
+    PadDataset,
+    SortDataset,
+    data_utils,
+    encoders,
+)
+from fairseq.tasks import LegacyFairseqTask, register_task
+
+from . import wsc_utils
+
+
+@register_task("wsc")
+class WSCTask(LegacyFairseqTask):
+    """Task to finetune RoBERTa for Winograd Schemas.
+
+    Each example is turned into one "query" sequence and a stack of
+    "candidate" sequences, obtained by substituting the query text or a
+    candidate noun phrase for the pronoun in the sentence. Every sequence
+    comes with a mask marking the substituted span.
+    """
+
+    @staticmethod
+    def add_args(parser):
+        """Add task-specific arguments to the parser."""
+        parser.add_argument(
+            "data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
+        )
+        parser.add_argument(
+            "--init-token",
+            type=int,
+            default=None,
+            help="add token at the beginning of each batch item",
+        )
+
+    def __init__(self, args, vocab):
+        """Store the vocabulary, register ``<mask>`` and build the BPE and
+        tokenizer encoders from ``args``."""
+        super().__init__(args)
+        self.vocab = vocab
+        # index of the mask symbol, used to hide the span being scored
+        self.mask = vocab.add_symbol("<mask>")
+
+        self.bpe = encoders.build_bpe(args)
+        self.tokenizer = encoders.build_tokenizer(args)
+
+        # hack to handle GPT-2 BPE, which includes leading spaces
+        if args.bpe == "gpt2":
+            self.leading_space = True
+            self.trailing_space = False
+        else:
+            self.leading_space = False
+            self.trailing_space = True
+
+    @classmethod
+    def load_dictionary(cls, filename):
+        """Load the dictionary from the filename and add the ``<mask>`` symbol.
+
+        Args:
+            filename (str): the filename
+        """
+        dictionary = Dictionary.load(filename)
+        dictionary.add_symbol("<mask>")
+        return dictionary
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task from ``args``; requires ``--criterion=wsc`` and a
+        ``dict.txt`` inside ``args.data``."""
+        assert args.criterion == "wsc", "Must set --criterion=wsc"
+
+        # load data and label dictionaries
+        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
+        print("| dictionary: {} types".format(len(vocab)))
+
+        return cls(args, vocab)
+
+    def binarize(self, s: str, append_eos: bool = False):
+        """Convert a raw string into a tensor of vocabulary indices.
+
+        The string is passed through the tokenizer and BPE (when configured)
+        and encoded without adding new symbols to the vocabulary. If
+        ``--init-token`` is set, that index is prepended.
+        """
+        if self.tokenizer is not None:
+            s = self.tokenizer.encode(s)
+        if self.bpe is not None:
+            s = self.bpe.encode(s)
+        tokens = self.vocab.encode_line(
+            s,
+            append_eos=append_eos,
+            add_if_not_exist=False,
+        ).long()
+        if self.args.init_token is not None:
+            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
+        return tokens
+
+    def binarize_with_mask(self, txt, prefix, suffix, leading_space, trailing_space):
+        """Binarize ``prefix + txt + suffix`` and mark the span of ``txt``.
+
+        Returns:
+            tuple: ``(toks, mask)`` where ``mask`` is a boolean tensor of the
+            same shape as ``toks`` that is set over the tokens of
+            ``leading_space + txt``.
+        """
+        toks = self.binarize(
+            prefix + leading_space + txt + trailing_space + suffix,
+            append_eos=True,
+        )
+        mask = torch.zeros_like(toks, dtype=torch.bool)
+        # the span starts right after the tokens of the prefix and covers as
+        # many tokens as ``leading_space + txt`` yields when binarized alone
+        mask_start = len(self.binarize(prefix))
+        mask_size = len(self.binarize(leading_space + txt))
+        mask[mask_start : mask_start + mask_size] = 1
+        return toks, mask
+
+    def load_dataset(
+        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
+    ):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            data_path (str, optional): explicit path of the jsonl file; by
+                default ``<args.data>/<split>.jsonl`` is used
+            return_only (bool): return the dataset without storing it in
+                ``self.datasets``
+
+        Raises:
+            FileNotFoundError: if the jsonl file does not exist
+        """
+        if data_path is None:
+            data_path = os.path.join(self.args.data, split + ".jsonl")
+        if not os.path.exists(data_path):
+            raise FileNotFoundError("Cannot find data: {}".format(data_path))
+
+        query_tokens = []
+        query_masks = []
+        query_lengths = []
+        candidate_tokens = []
+        candidate_masks = []
+        candidate_lengths = []
+        labels = []
+
+        for sentence, pronoun_span, query, label in wsc_utils.jsonl_iterator(data_path):
+            # text before and after the pronoun; the pronoun itself is
+            # replaced by the query / candidate text below
+            prefix = sentence[: pronoun_span.start].text
+            suffix = sentence[pronoun_span.end :].text_with_ws
+
+            # spaCy spans include trailing spaces, but we need to know about
+            # leading spaces for the GPT-2 BPE
+            leading_space = (
+                " " if sentence[: pronoun_span.start].text_with_ws.endswith(" ") else ""
+            )
+            trailing_space = " " if pronoun_span.text_with_ws.endswith(" ") else ""
+
+            # get noun phrases, excluding pronouns and anything overlapping with the query
+            cand_spans = wsc_utils.filter_noun_chunks(
+                wsc_utils.extended_noun_chunks(sentence),
+                exclude_pronouns=True,
+                exclude_query=query,
+                exact_match=False,
+            )
+
+            if query is not None:
+                query_toks, query_mask = self.binarize_with_mask(
+                    query, prefix, suffix, leading_space, trailing_space
+                )
+                query_len = len(query_toks)
+            else:
+                # no query given (inference without a marked query span)
+                query_toks, query_mask, query_len = None, None, 0
+
+            query_tokens.append(query_toks)
+            query_masks.append(query_mask)
+            query_lengths.append(query_len)
+
+            cand_toks, cand_masks = [], []
+            for cand_span in cand_spans:
+                toks, mask = self.binarize_with_mask(
+                    cand_span.text,
+                    prefix,
+                    suffix,
+                    leading_space,
+                    trailing_space,
+                )
+                cand_toks.append(toks)
+                cand_masks.append(mask)
+
+            # collate candidates
+            # NOTE(review): assumes at least one candidate span was found;
+            # behaviour of collate_tokens on an empty list is not shown here
+            cand_toks = data_utils.collate_tokens(cand_toks, pad_idx=self.vocab.pad())
+            cand_masks = data_utils.collate_tokens(cand_masks, pad_idx=0)
+            assert cand_toks.size() == cand_masks.size()
+
+            candidate_tokens.append(cand_toks)
+            candidate_masks.append(cand_masks)
+            candidate_lengths.append(cand_toks.size(1))
+
+            labels.append(label)
+
+        query_lengths = np.array(query_lengths)
+        query_tokens = ListDataset(query_tokens, query_lengths)
+        query_masks = ListDataset(query_masks, query_lengths)
+
+        candidate_lengths = np.array(candidate_lengths)
+        candidate_tokens = ListDataset(candidate_tokens, candidate_lengths)
+        candidate_masks = ListDataset(candidate_masks, candidate_lengths)
+
+        labels = ListDataset(labels, [1] * len(labels))
+
+        dataset = {
+            "id": IdDataset(),
+            "query_tokens": query_tokens,
+            "query_masks": query_masks,
+            "candidate_tokens": candidate_tokens,
+            "candidate_masks": candidate_masks,
+            "labels": labels,
+            "nsentences": NumSamplesDataset(),
+            "ntokens": NumelDataset(query_tokens, reduce=True),
+        }
+
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[query_lengths],
+        )
+
+        # seeded permutation so the example order is reproducible
+        with data_utils.numpy_seed(self.args.seed):
+            shuffle = np.random.permutation(len(query_tokens))
+        dataset = SortDataset(
+            nested_dataset,
+            # shuffle
+            sort_order=[shuffle],
+        )
+
+        if return_only:
+            return dataset
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
+
+    def build_dataset_for_inference(self, sample_json):
+        """Build a one-example dataset from a JSON-serialisable sample.
+
+        The sample is written to a temporary jsonl file and loaded through
+        :meth:`load_dataset` with ``return_only=True``, so it is not stored
+        in ``self.datasets``.
+        """
+        # buffering=0 so the line is on disk before load_dataset reads it
+        with tempfile.NamedTemporaryFile(buffering=0) as h:
+            h.write((json.dumps(sample_json) + "\n").encode("utf-8"))
+            dataset = self.load_dataset(
+                "disambiguate_pronoun",
+                data_path=h.name,
+                return_only=True,
+            )
+        return dataset
+
+    def disambiguate_pronoun(self, model, sentence, use_cuda=False):
+        """Resolve the bracketed pronoun in ``sentence``.
+
+        Returns:
+            bool or str: if the sentence marks a query, whether the query
+            scores at least as high as every candidate; otherwise the decoded
+            text of the best-scoring candidate.
+        """
+        sample_json = wsc_utils.convert_sentence_to_json(sentence)
+        dataset = self.build_dataset_for_inference(sample_json)
+        sample = dataset.collater([dataset[0]])
+        if use_cuda:
+            sample = utils.move_to_cuda(sample)
+
+        def get_masked_input(tokens, mask):
+            # replace the masked positions with the <mask> symbol
+            masked_tokens = tokens.clone()
+            masked_tokens[mask.bool()] = self.mask
+            return masked_tokens
+
+        def get_lprobs(tokens, mask):
+            # mean log-probability of the original tokens at masked positions
+            logits, _ = model(src_tokens=get_masked_input(tokens, mask))
+            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
+            scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
+            mask = mask.type_as(scores)
+            scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
+            return scores
+
+        cand_lprobs = get_lprobs(
+            sample["candidate_tokens"][0],
+            sample["candidate_masks"][0],
+        )
+        if sample["query_tokens"][0] is not None:
+            query_lprobs = get_lprobs(
+                sample["query_tokens"][0].unsqueeze(0),
+                sample["query_masks"][0].unsqueeze(0),
+            )
+            return (query_lprobs >= cand_lprobs).all().item() == 1
+        else:
+            # no query: return the text of the highest-scoring candidate span
+            best_idx = cand_lprobs.argmax().item()
+            full_cand = sample["candidate_tokens"][0][best_idx]
+            mask = sample["candidate_masks"][0][best_idx]
+            toks = full_cand[mask.bool()]
+            return self.bpe.decode(self.source_dictionary.string(toks)).strip()
+
+    @property
+    def source_dictionary(self):
+        """The task vocabulary."""
+        return self.vocab
+
+    @property
+    def target_dictionary(self):
+        """The task vocabulary (same object as the source dictionary)."""
+        return self.vocab
+
+
+@register_task("winogrande")
+class WinograndeTask(WSCTask):
+    """
+    Task for WinoGrande dataset. Efficient implementation for Winograd schema
+    tasks with exactly two candidates, one of which is correct.
+    """
+
+    @classmethod
+    def setup_task(cls, args, **kwargs):
+        """Build the task from ``args``; requires ``--criterion=winogrande``
+        and a ``dict.txt`` inside ``args.data``."""
+        assert args.criterion == "winogrande", "Must set --criterion=winogrande"
+
+        # load data and label dictionaries
+        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
+        print("| dictionary: {} types".format(len(vocab)))
+
+        return cls(args, vocab)
+
+    def load_dataset(
+        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
+    ):
+        """Load a given dataset split.
+
+        Args:
+            split (str): name of the split (e.g., train, valid, test)
+            data_path (str, optional): explicit path of the jsonl file; by
+                default ``<args.data>/<split>.jsonl`` is used
+            return_only (bool): return the dataset without storing it in
+                ``self.datasets``
+
+        Raises:
+            FileNotFoundError: if the jsonl file does not exist
+        """
+        if data_path is None:
+            data_path = os.path.join(self.args.data, split + ".jsonl")
+        if not os.path.exists(data_path):
+            raise FileNotFoundError("Cannot find data: {}".format(data_path))
+
+        query_tokens = []
+        query_masks = []
+        query_lengths = []
+        candidate_tokens = []
+        candidate_masks = []
+        candidate_lengths = []
+
+        # the "test" split is read in eval mode, where the iterator does not
+        # use the answer field to order the two options
+        itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=(split == "test"))
+
+        for sample in itr:
+            sentence, pronoun_span, query, cand_text = sample
+            # pronoun_span is a (start, end) pair of character offsets of the
+            # blank in the sentence
+            prefix = sentence[: pronoun_span[0]].rstrip()
+            suffix = sentence[pronoun_span[1] :]
+
+            leading_space = " " if sentence[: pronoun_span[0]].endswith(" ") else ""
+            trailing_space = ""
+
+            if query is not None:
+                query_toks, query_mask = self.binarize_with_mask(
+                    query,
+                    prefix,
+                    suffix,
+                    leading_space,
+                    trailing_space,
+                )
+                query_len = len(query_toks)
+            else:
+                query_toks, query_mask, query_len = None, None, 0
+
+            query_tokens.append(query_toks)
+            query_masks.append(query_mask)
+            query_lengths.append(query_len)
+
+            cand_toks, cand_mask = self.binarize_with_mask(
+                cand_text,
+                prefix,
+                suffix,
+                leading_space,
+                trailing_space,
+            )
+
+            candidate_tokens.append(cand_toks)
+            candidate_masks.append(cand_mask)
+            candidate_lengths.append(cand_toks.size(0))
+
+        query_lengths = np.array(query_lengths)
+
+        def get_pad_dataset_fn(tokens, length, pad_idx):
+            # wrap the raw list so batches are right-padded with ``pad_idx``
+            return PadDataset(
+                ListDataset(tokens, length),
+                pad_idx=pad_idx,
+                left_pad=False,
+            )
+
+        query_tokens = get_pad_dataset_fn(query_tokens, query_lengths, self.vocab.pad())
+        query_masks = get_pad_dataset_fn(query_masks, query_lengths, 0)
+
+        candidate_lengths = np.array(candidate_lengths)
+        candidate_tokens = get_pad_dataset_fn(
+            candidate_tokens, candidate_lengths, self.vocab.pad()
+        )
+        candidate_masks = get_pad_dataset_fn(candidate_masks, candidate_lengths, 0)
+
+        dataset = {
+            "id": IdDataset(),
+            "query_tokens": query_tokens,
+            "query_masks": query_masks,
+            "candidate_tokens": candidate_tokens,
+            "candidate_masks": candidate_masks,
+            "nsentences": NumSamplesDataset(),
+            "ntokens": NumelDataset(query_tokens, reduce=True),
+        }
+
+        nested_dataset = NestedDictionaryDataset(
+            dataset,
+            sizes=[query_lengths],
+        )
+
+        # seeded permutation so the example order is reproducible
+        with data_utils.numpy_seed(self.args.seed):
+            shuffle = np.random.permutation(len(query_tokens))
+        dataset = SortDataset(
+            nested_dataset,
+            # shuffle
+            sort_order=[shuffle],
+        )
+
+        if return_only:
+            return dataset
+
+        self.datasets[split] = dataset
+        return self.datasets[split]
--- a/fairseq/examples/roberta/wsc/wsc_utils.py
+++ b/fairseq/examples/roberta/wsc/wsc_utils.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+from functools import lru_cache
+
+
+def convert_sentence_to_json(sentence):
+    """Convert a marked-up sentence into a WSC-style JSON sample.
+
+    The pronoun must be wrapped in square brackets (``[it]``); the query span
+    may optionally be wrapped in underscores (``_the trophy_``).
+
+    Args:
+        sentence (str): the marked-up sentence
+
+    Returns:
+        dict: sample with the markers stripped from ``"text"`` and a
+        ``"target"`` entry holding the query (``span1_*``, ``None`` when no
+        query is marked) and the pronoun (``span2_*``) with their
+        space-separated word indices.
+
+    Raises:
+        ValueError: if a marker is missing or unbalanced
+    """
+
+    def word_index(prefix):
+        # Number of space-separated words before the marker. An empty prefix
+        # needs special care: "".split(" ") returns [""], which would report
+        # index 1 for a marker sitting at the very start of the sentence.
+        stripped = prefix.rstrip()
+        return len(stripped.split(" ")) if stripped else 0
+
+    if "_" in sentence:
+        prefix, rest = sentence.split("_", 1)
+        query, rest = rest.split("_", 1)
+        query_index = word_index(prefix)
+    else:
+        query, query_index = None, None
+
+    prefix, rest = sentence.split("[", 1)
+    pronoun, rest = rest.split("]", 1)
+    pronoun_index = word_index(prefix)
+
+    sentence = sentence.replace("_", "").replace("[", "").replace("]", "")
+
+    return {
+        "idx": 0,
+        "text": sentence,
+        "target": {
+            "span1_index": query_index,
+            "span1_text": query,
+            "span2_index": pronoun_index,
+            "span2_text": pronoun,
+        },
+    }
+
+
+def extended_noun_chunks(sentence):
+    """Return the sentence's noun chunks plus every maximal run of tokens
+    sharing the same nominal POS tag (``NOUN`` or ``PROPN``), as slices of
+    ``sentence`` sorted by (start, end)."""
+    spans = set()
+    for chunk in sentence.noun_chunks:
+        spans.add((chunk.start, chunk.end))
+
+    nominal_tags = {"NOUN", "PROPN"}
+    run_start, run_tag = 0, "NONE"
+    for position, tok in enumerate(sentence):
+        tag = tok.pos_ if tok.pos_ in nominal_tags else "NONE"
+        if tag == run_tag:
+            # still inside the same run (or still outside any run)
+            continue
+        if run_tag != "NONE":
+            # the previous nominal run ends right before this token
+            spans.add((run_start, position))
+        if tag != "NONE":
+            run_start = position
+        run_tag = tag
+
+    # close a run that reaches the end of the sentence
+    if run_tag != "NONE":
+        spans.add((run_start, len(sentence)))
+
+    return [sentence[lo:hi] for lo, hi in sorted(spans)]
+
+
+def find_token(sentence, start_pos):
+    """Return the first token whose character offset ``idx`` equals
+    ``start_pos``, or ``None`` if there is no such token."""
+    matches = (tok for tok in sentence if tok.idx == start_pos)
+    return next(matches, None)
+
+
+def find_span(sentence, search_text, start=0):
+    """Find ``search_text`` (case-insensitively) in ``sentence``.
+
+    Scanning begins at token index ``start``. Returns the slice of
+    ``sentence`` whose character length matches the search text exactly, or
+    ``None`` when the text does not line up with token boundaries or is not
+    present.
+    """
+    needle = search_text.lower()
+    wanted_chars = len(needle)
+    for head in sentence[start:]:
+        tail = sentence[head.i :]
+        if not tail.text.lower().startswith(needle):
+            continue
+        # extend token by token until exactly ``wanted_chars`` characters
+        # have been consumed
+        first_char = head.idx
+        for last in sentence[head.i :]:
+            consumed = last.idx + len(last.text) - first_char
+            if consumed == wanted_chars:
+                return sentence[head.i : last.i + 1]
+    return None
+
+
+@lru_cache(maxsize=1)
+def get_detokenizer():
+    """Return an English Moses detokenizer; the result is cached so it is
+    built only once."""
+    # imported inside the function so sacremoses is only needed on first use
+    from sacremoses import MosesDetokenizer
+
+    return MosesDetokenizer(lang="en")
+
+
+@lru_cache(maxsize=1)
+def get_spacy_nlp():
+    """Return the loaded ``en_core_web_lg`` spaCy pipeline; the result is
+    cached so the model is loaded only once."""
+    # imported inside the function so the model package is only needed on
+    # first use
+    import en_core_web_lg
+
+    return en_core_web_lg.load()
+
+
+def jsonl_iterator(input_fname, positive_only=False, ngram_order=3, eval=False):
+    """Iterate over the WSC samples stored in a jsonl file.
+
+    Each line is a JSON object with a ``"text"`` field and a ``"target"``
+    dict holding ``span1_text`` (query), ``span2_index`` and ``span2_text``
+    (pronoun), plus an optional ``"label"``.
+
+    Args:
+        input_fname (str): path of the jsonl file
+        positive_only (bool): skip samples whose label is present and falsy
+        ngram_order (int): not used by this function
+        eval (bool): yield a marked-up sentence string instead of spaCy
+            objects (see below)
+
+    Yields:
+        When ``eval`` is false: ``(sentence, pronoun_span, query, label)``
+        where ``sentence`` is the spaCy parse and ``pronoun_span`` the span of
+        the pronoun. When ``eval`` is true: ``(sentence, label)`` where
+        ``sentence`` is a string with the pronoun wrapped in ``[]`` and the
+        query wrapped in ``_``. ``label`` is ``None`` when the sample has none.
+
+    Raises:
+        Exception: if the pronoun cannot be located at ``span2_index`` or the
+            following token
+    """
+    detok = get_detokenizer()
+    nlp = get_spacy_nlp()
+
+    with open(input_fname) as fin:
+        for line in fin:
+            sample = json.loads(line.strip())
+
+            if positive_only and "label" in sample and not sample["label"]:
+                # only consider examples where the query is correct
+                continue
+
+            target = sample["target"]
+
+            # clean up the query
+            query = target["span1_text"]
+            if query is not None:
+                if "\n" in query:
+                    # samples with a multi-line query are skipped entirely
+                    continue
+                if query.endswith(".") or query.endswith(","):
+                    query = query[:-1]
+
+            # split tokens
+            tokens = sample["text"].split(" ")
+
+            def strip_pronoun(x):
+                # drop trailing punctuation/quotes attached to the pronoun
+                return x.rstrip('.,"')
+
+            # find the pronoun
+            pronoun_idx = target["span2_index"]
+            pronoun = strip_pronoun(target["span2_text"])
+            if strip_pronoun(tokens[pronoun_idx]) != pronoun:
+                # hack: sometimes the index is misaligned
+                # NOTE(review): tokens[pronoun_idx + 1] raises IndexError when
+                # the index already points at the last token
+                if strip_pronoun(tokens[pronoun_idx + 1]) == pronoun:
+                    pronoun_idx += 1
+                else:
+                    raise Exception("Misaligned pronoun!")
+            assert strip_pronoun(tokens[pronoun_idx]) == pronoun
+
+            # split tokens before and after the pronoun
+            before = tokens[:pronoun_idx]
+            after = tokens[pronoun_idx + 1 :]
+
+            # the GPT BPE attaches leading spaces to tokens, so we keep track
+            # of whether we need spaces before or after the pronoun
+            leading_space = " " if pronoun_idx > 0 else ""
+            trailing_space = " " if len(after) > 0 else ""
+
+            # detokenize
+            before = detok.detokenize(before, return_str=True)
+            pronoun = detok.detokenize([pronoun], return_str=True)
+            after = detok.detokenize(after, return_str=True)
+
+            # hack: when the pronoun ends in a period (or comma), move the
+            # punctuation to the "after" part
+            if pronoun.endswith(".") or pronoun.endswith(","):
+                after = pronoun[-1] + trailing_space + after
+                pronoun = pronoun[:-1]
+
+            # hack: when the "after" part begins with a comma or period, remove
+            # the trailing space
+            if after.startswith(".") or after.startswith(","):
+                trailing_space = ""
+
+            # parse sentence with spacy
+            sentence = nlp(before + leading_space + pronoun + trailing_space + after)
+
+            # find pronoun span
+            # the pronoun starts at the character offset right after
+            # ``before + leading_space`` in the reassembled sentence
+            start = len(before + leading_space)
+            # NOTE(review): find_token returns None when no token starts at
+            # that offset, which would fail on ``.i`` below
+            first_pronoun_tok = find_token(sentence, start_pos=start)
+            pronoun_span = find_span(sentence, pronoun, start=first_pronoun_tok.i)
+            assert pronoun_span.text == pronoun
+
+            if eval:
+                # convert to format where pronoun is surrounded by "[]" and
+                # query is surrounded by "_"
+                # NOTE(review): this branch appears to require a non-None
+                # query that find_span can locate in the sentence
+                query_span = find_span(sentence, query)
+                query_with_ws = "_{}_{}".format(
+                    query_span.text,
+                    (" " if query_span.text_with_ws.endswith(" ") else ""),
+                )
+                pronoun_with_ws = "[{}]{}".format(
+                    pronoun_span.text,
+                    (" " if pronoun_span.text_with_ws.endswith(" ") else ""),
+                )
+                # emit the two marked spans in the order they occur
+                if query_span.start < pronoun_span.start:
+                    first = (query_span, query_with_ws)
+                    second = (pronoun_span, pronoun_with_ws)
+                else:
+                    first = (pronoun_span, pronoun_with_ws)
+                    second = (query_span, query_with_ws)
+                sentence = (
+                    sentence[: first[0].start].text_with_ws
+                    + first[1]
+                    + sentence[first[0].end : second[0].start].text_with_ws
+                    + second[1]
+                    + sentence[second[0].end :].text
+                )
+                yield sentence, sample.get("label", None)
+            else:
+                yield sentence, pronoun_span, query, sample.get("label", None)
+
+
+def winogrande_jsonl_iterator(input_fname, eval=False):
+    """Iterate over WinoGrande samples stored in a jsonl file.
+
+    Yields ``(sentence, pronoun_span, query, cand)`` where ``pronoun_span``
+    is the ``(start, end)`` character range of the first ``_`` in the
+    sentence. In eval mode ``query``/``cand`` are ``option1``/``option2``;
+    otherwise ``query`` is the option selected by ``answer`` and ``cand`` the
+    other one.
+    """
+    with open(input_fname) as handle:
+        for raw_line in handle:
+            record = json.loads(raw_line.strip())
+            text = record["sentence"]
+            first, second = record["option1"], record["option2"]
+
+            blank_at = text.index("_")
+            blank_span = (blank_at, blank_at + 1)
+
+            # the answer field is only consulted outside eval mode
+            if eval or record["answer"] == "1":
+                yield text, blank_span, first, second
+            else:
+                yield text, blank_span, second, first
+
+
+def filter_noun_chunks(
+    chunks, exclude_pronouns=False, exclude_query=None, exact_match=False
+):
+    """Filter candidate noun chunks.
+
+    Args:
+        chunks: iterable of spans to filter
+        exclude_pronouns (bool): drop chunks whose lemma is ``-PRON-`` or
+            whose tokens are all tagged ``PRON``
+        exclude_query (str, optional): drop chunks matching this text
+            (case-insensitively)
+        exact_match (bool): when false, a chunk also matches the query if
+            either text is a substring of the other
+
+    Returns:
+        the remaining chunks (the input object itself when no filter applies)
+    """
+    if exclude_pronouns:
+        kept = []
+        for chunk in chunks:
+            if chunk.lemma_ == "-PRON-":
+                continue
+            if all(tok.pos_ == "PRON" for tok in chunk):
+                continue
+            kept.append(chunk)
+        chunks = kept
+
+    if exclude_query is not None:
+        needle = exclude_query.lower()
+
+        def clashes_with_query(chunk):
+            text = chunk.text.lower()
+            if text == needle:
+                return True
+            # loose matching: either text contains the other
+            return not exact_match and (text in needle or needle in text)
+
+        chunks = [chunk for chunk in chunks if not clashes_with_query(chunk)]
+
+    return chunks
--- a/fairseq/examples/rxf/README.md
+++ b/fairseq/examples/rxf/README.md
+[Better Fine-Tuning by Reducing Representational Collapse](https://arxiv.org/abs/2008.03156)
+=====================
+This repo contains the code to replicate all experiments from the _Better Fine-Tuning by Reducing Representational Collapse_ paper excluding the probing results.
+
+The R3F sentence prediction criterion is registered as `sentence_prediction_r3f` while the label smoothing version of it is implemented as `label_smoothed_cross_entropy_r3f`. The R4F version of the sentence prediction criterion can be achieved by applying spectral norm to the classification head via the `--spectral-norm-classification-head` parameter.
+
+## Hyper-parameters
+Our methods introduce 3 new hyper-parameters: `--eps`, which sets the standard deviation or range of the distribution we're sampling from; `--r3f-lambda`, which controls how the logistic loss and the noisy KL loss are combined; and `--noise-type`, which selects the parametric distribution we use (`normal` or `uniform`).
+
+For example, to run R3F on RTE from GLUE:
+
+```
+TOTAL_NUM_UPDATES=3120
+WARMUP_UPDATES=187
+LR=1e-05
+NUM_CLASSES=2
+MAX_SENTENCES=8        # Batch size.
+ROBERTA_PATH=/path/to/roberta/model.pt
+
+CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \
+    --restore-file $ROBERTA_PATH \
+    --max-positions 512 \
+    --max-sentences $MAX_SENTENCES \
+    --max-tokens 4400 \
+    --task sentence_prediction \
+    --reset-optimizer --reset-dataloader --reset-meters \
+    --required-batch-size-multiple 1 \
+    --init-token 0 --separator-token 2 \
+    --arch roberta_large \
+    --criterion sentence_prediction_r3f \
+    --num-classes $NUM_CLASSES \
+    --dropout 0.1 --attention-dropout 0.1 \
+    --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
+    --clip-norm 0.0 \
+    --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
+    --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
+    --max-epoch 10 \
+    --find-unused-parameters \
+    --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
+    --noise-type uniform --r3f-lambda 0.7 \
+    --user-dir examples/rxf/rxf_src
+```
+
+## Citation
+```bibtex
+@article{aghajanyan2020better,
+  title={Better Fine-Tuning by Reducing Representational Collapse},
+  author={Aghajanyan, Armen and Shrivastava, Akshat and Gupta, Anchit and Goyal, Naman and Zettlemoyer, Luke and Gupta, Sonal},
+  journal={arXiv preprint arXiv:2008.03156},
+  year={2020}
+}
+```
--- a/fairseq/examples/rxf/__init__.py
+++ b/fairseq/examples/rxf/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import rxf_src  # noqa
--- a/fairseq/examples/rxf/rxf_src/__init__.py
+++ b/fairseq/examples/rxf/rxf_src/__init__.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from . import label_smoothed_cross_entropy_r3f, sentence_prediction_r3f  # noqa