# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import Any, Dict, Optional
import torch
import torch.nn as nn
from fairseq import utils
from fairseq.models import register_model, register_model_architecture
from fairseq.models.fairseq_encoder import EncoderOut
from fairseq.models.transformer import (
DEFAULT_MAX_SOURCE_POSITIONS,
DEFAULT_MAX_TARGET_POSITIONS,
TransformerDecoder,
TransformerEncoder,
TransformerModel,
base_architecture,
)
from torch import Tensor
logger = logging.getLogger(__name__)
@register_model("transformer_pointer_generator")
class TransformerPointerGeneratorModel(TransformerModel):
"""
Transformer model from `"Attention Is All You Need" (Vaswani et al, 2017)
<https://arxiv.org/abs/1706.03762>`_, augmented with a pointer-generator
network from `"Get To The Point: Summarization with Pointer-Generator
Networks" (See et al, 2017) <https://arxiv.org/abs/1704.04368>`_.
Args:
encoder (TransformerPointerGeneratorEncoder): the encoder
decoder (TransformerPointerGeneratorDecoder): the decoder
The Transformer pointer-generator model provides the following named
architectures and command-line arguments:
.. argparse::
:ref: fairseq.models.transformer_pointer_generator_parser
:prog:
"""
@staticmethod
def add_args(parser):
"""Add model-specific arguments to the parser."""
# fmt: off
TransformerModel.add_args(parser)
parser.add_argument('--alignment-heads', type=int, metavar='N',
help='number of attention heads to be used for '
'pointing')
parser.add_argument('--alignment-layer', type=int, metavar='I',
help='layer number to be used for pointing (0 '
'corresponding to the bottommost layer)')
parser.add_argument('--source-position-markers', type=int, metavar='N',
help='dictionary includes N additional items that '
'represent an OOV token at a particular input '
'position')
parser.add_argument('--force-generation', type=float, metavar='P',
default=None,
help='set the vocabulary distribution weight to P, '
'instead of predicting it from the input (1.0 '
'corresponding to generation, 0.0 to pointing)')
# fmt: on
@classmethod
def build_model(cls, args, task):
"""Build a new model instance."""
# make sure all arguments are present in older models
base_architecture(args)
if args.encoder_layers_to_keep:
args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
if args.decoder_layers_to_keep:
args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
if getattr(args, "max_source_positions", None) is None:
args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
if getattr(args, "max_target_positions", None) is None:
args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
if getattr(args, "source_position_markers", None) is None:
args.source_position_markers = args.max_source_positions
src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
if src_dict != tgt_dict:
raise ValueError("Pointer-generator requires a joined dictionary")
def build_embedding(dictionary, embed_dim, path=None):
# The dictionary may include additional items that can be used in
# place of the normal OOV token and that all map to the same
# embedding. Using a different token for each input position allows
# one to restore the word identities from the original source text.
num_embeddings = len(dictionary) - args.source_position_markers
padding_idx = dictionary.pad()
unk_idx = dictionary.unk()
logger.info(
"dictionary indices from {0} to {1} will be mapped to {2}".format(
num_embeddings, len(dictionary) - 1, unk_idx
)
)
emb = Embedding(num_embeddings, embed_dim, padding_idx, unk_idx)
# if provided, load from preloaded dictionaries
if path:
embed_dict = utils.parse_embedding(path)
utils.load_embedding(embed_dict, dictionary, emb)
return emb
if args.share_all_embeddings:
if args.encoder_embed_dim != args.decoder_embed_dim:
raise ValueError(
"--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
)
if args.decoder_embed_path and (
args.decoder_embed_path != args.encoder_embed_path
):
raise ValueError(
"--share-all-embeddings not compatible with --decoder-embed-path"
)
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = encoder_embed_tokens
args.share_decoder_input_output_embed = True
else:
encoder_embed_tokens = build_embedding(
src_dict, args.encoder_embed_dim, args.encoder_embed_path
)
decoder_embed_tokens = build_embedding(
tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
)
encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
return cls(args, encoder, decoder)
@classmethod
def build_encoder(cls, args, src_dict, embed_tokens):
return TransformerPointerGeneratorEncoder(args, src_dict, embed_tokens)
@classmethod
def build_decoder(cls, args, tgt_dict, embed_tokens):
return TransformerPointerGeneratorDecoder(args, tgt_dict, embed_tokens)
class TransformerPointerGeneratorEncoder(TransformerEncoder):
"""
Transformer encoder consisting of *args.encoder_layers* layers. Each layer
is a :class:`TransformerEncoderLayer`. The pointer-generator variant adds
the source tokens to the encoder output as these are otherwise not passed
to the decoder.
"""
def forward(self, src_tokens, src_lengths, **kwargs):
"""
Runs the `forward()` method of the parent Transformer class. Then adds
the source tokens into the encoder output tuple.
While it might be more elegant that the model would pass the source
tokens to the `forward()` method of the decoder too, this would require
changes to `SequenceGenerator`.
Args:
src_tokens (torch.LongTensor): tokens in the source language of
shape `(batch, src_len)`
src_lengths (torch.LongTensor): lengths of each source sentence of
shape `(batch)`
Returns:
namedtuple:
- **encoder_out** (Tensor): the last encoder layer's output of
shape `(src_len, batch, embed_dim)`
- **encoder_padding_mask** (ByteTensor): the positions of
padding elements of shape `(batch, src_len)`
- **encoder_embedding** (Tensor): the (scaled) embedding lookup
of shape `(batch, src_len, embed_dim)`
- **encoder_states** (List[Tensor]): all intermediate
hidden states of shape `(src_len, batch, embed_dim)`.
Only populated if *return_all_hiddens* is True.
- **src_tokens** (Tensor): input token ids of shape
`(batch, src_len)`
"""
encoder_out = super().forward(src_tokens, src_lengths, **kwargs)
return EncoderOut(
encoder_out=encoder_out.encoder_out, # T x B x C
encoder_padding_mask=encoder_out.encoder_padding_mask, # B x T
encoder_embedding=encoder_out.encoder_embedding, # B x T x C
encoder_states=encoder_out.encoder_states, # List[T x B x C]
src_tokens=src_tokens, # B x T
src_lengths=None,
)
class TransformerPointerGeneratorDecoder(TransformerDecoder):
"""
Transformer decoder consisting of *args.decoder_layers* layers. Each layer
is a :class:`TransformerDecoderLayer`. The pointer-generator variant mixes
the output probabilities with an attention distribution in the output layer.
Args:
args (argparse.Namespace): parsed command-line arguments
dictionary (~fairseq.data.Dictionary): decoding dictionary
embed_tokens (torch.nn.Embedding): output embedding
"""
def __init__(self, args, dictionary, embed_tokens):
super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False)
# In the pointer-generator model these arguments define the decoder
# layer and the number of attention heads that will be averaged to
# create the alignment for pointing.
self.alignment_heads = args.alignment_heads
self.alignment_layer = args.alignment_layer
input_embed_dim = embed_tokens.embedding_dim
# Generation probabilities / interpolation coefficients are predicted
# from the current decoder input embedding and the decoder output, which
# is the size of output_embed_dim.
p_gen_input_size = input_embed_dim + self.output_embed_dim
self.project_p_gens = nn.Linear(p_gen_input_size, 1)
nn.init.zeros_(self.project_p_gens.bias)
# The dictionary may include a separate entry for an OOV token in each
# input position, so that their identity can be restored from the
# original source text.
self.num_types = len(dictionary)
self.num_oov_types = args.source_position_markers
self.num_embeddings = self.num_types - self.num_oov_types
self.force_p_gen = args.force_generation
def forward(
self,
prev_output_tokens,
encoder_out: Optional[EncoderOut] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
features_only: bool = False,
alignment_layer: Optional[int] = 0,
alignment_heads: Optional[int] = 1,
src_lengths: Optional[Any] = None,
return_all_hiddens: bool = False,
):
"""
Args:
prev_output_tokens (LongTensor): previous decoder outputs of shape
`(batch, tgt_len)`, for teacher forcing
encoder_out (EncoderOut, optional): output from the encoder, used
for encoder-side attention
incremental_state (dict, optional): dictionary used for storing
state during :ref:`Incremental decoding`
features_only (bool, optional): only return features without
applying output layer (default: False)
alignment_layer (int, optional): 0-based index of the layer to be
used for pointing (default: 0)
alignment_heads (int, optional): number of attention heads to be
used for pointing (default: 1)
Returns:
tuple:
- the decoder's output of shape `(batch, tgt_len, vocab)`
- a dictionary with any model-specific outputs
"""
# The normal Transformer model doesn't pass the alignment_layer and
# alignment_heads parameters correctly. We use our local variables.
x, extra = self.extract_features(
prev_output_tokens,
encoder_out=encoder_out,
incremental_state=incremental_state,
alignment_layer=self.alignment_layer,
alignment_heads=self.alignment_heads,
)
if not features_only:
# Embedding the tokens again for generation probability prediction,
# so that we don't have to reimplement the whole extract_features()
# method.
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
prev_output_embed = self.embed_tokens(prev_output_tokens)
prev_output_embed *= self.embed_scale
predictors = torch.cat((prev_output_embed, x), 2)
p_gens = self.project_p_gens(predictors)
p_gens = torch.sigmoid(p_gens)
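            # p_gens is the generation probability: 1.0 corresponds to generating
            # from the vocabulary and 0.0 to pointing (copying) from the source.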
x = self.output_layer(x, extra["attn"][0], encoder_out.src_tokens, p_gens)
return x, extra
def output_layer(self, features, attn, src_tokens, p_gens, **kwargs):
"""
Project features to the vocabulary size and mix with the attention
distributions.
"""
if self.force_p_gen is not None:
p_gens = self.force_p_gen
# project back to size of vocabulary
logits = super().output_layer(features, **kwargs)
batch_size = logits.shape[0]
output_length = logits.shape[1]
assert logits.shape[2] == self.num_embeddings
assert src_tokens.shape[0] == batch_size
src_length = src_tokens.shape[1]
# The final output distribution will be a mixture of the normal output
# distribution (softmax of logits) and attention weights.
gen_dists = super().get_normalized_probs(
(logits, None), log_probs=False, sample=None
)
gen_dists = torch.mul(gen_dists, p_gens)
padding_size = (batch_size, output_length, self.num_oov_types)
padding = gen_dists.new_zeros(padding_size)
gen_dists = torch.cat((gen_dists, padding), 2)
assert gen_dists.shape[2] == self.num_types
# Scatter attention distributions to distributions over the extended
# vocabulary in a tensor of shape [batch_size, output_length,
# vocab_size]. Each attention weight will be written into a location
# that is for other dimensions the same as in the index tensor, but for
# the third dimension it's the value of the index tensor (the token ID).
attn = torch.mul(attn, 1 - p_gens)
index = src_tokens[:, None, :]
index = index.expand(batch_size, output_length, src_length)
attn_dists_size = (batch_size, output_length, self.num_types)
attn_dists = attn.new_zeros(attn_dists_size)
attn_dists.scatter_add_(2, index, attn)
# Final distributions, [batch_size, output_length, num_types].
return gen_dists + attn_dists
def get_normalized_probs(self, net_output, log_probs, sample):
"""
Get normalized probabilities (or log probs) from a net's output.
Pointer-generator network output is already normalized.
"""
probs = net_output[0]
# Make sure the probabilities are greater than zero when returning log
# probabilities.
return probs.clamp(1e-10, 1.0).log() if log_probs else probs
class Embedding(nn.Embedding):
r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
This module is often used to store word embeddings and retrieve them using indices.
The input to the module is a list of indices, and the output is the corresponding
word embeddings. This subclass differs from the standard PyTorch Embedding class by
allowing additional vocabulary entries that will be mapped to the unknown token
embedding.
Args:
num_embeddings (int): size of the dictionary of embeddings
embedding_dim (int): the size of each embedding vector
padding_idx (int): Pads the output with the embedding vector at :attr:`padding_idx`
(initialized to zeros) whenever it encounters the index.
unk_idx (int): Maps all token indices that are greater than or equal to
num_embeddings to this index.
Attributes:
weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
initialized from :math:`\mathcal{N}(0, 1)`
Shape:
- Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
- Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
.. note::
Keep in mind that only a limited number of optimizers support
sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
:class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
.. note::
With :attr:`padding_idx` set, the embedding vector at
:attr:`padding_idx` is initialized to all zeros. However, note that this
vector can be modified afterwards, e.g., using a customized
initialization method, and thus changing the vector used to pad the
output. The gradient for this vector from :class:`~torch.nn.Embedding`
is always zero.
"""
__constants__ = ["unk_idx"]
def __init__(self, num_embeddings, embedding_dim, padding_idx, unk_idx):
super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
self.unk_idx = unk_idx
nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5)
nn.init.constant_(self.weight[padding_idx], 0)
def forward(self, input):
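        # Map OOV position markers (indices >= num_embeddings) to the unk token
        # index before the standard embedding lookup.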
input = torch.where(
input >= self.num_embeddings, torch.ones_like(input) * self.unk_idx, input
)
return super().forward(input)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator"
)
def transformer_pointer_generator(args):
args.alignment_heads = getattr(args, "alignment_heads", 1)
args.alignment_layer = getattr(args, "alignment_layer", -1)
base_architecture(args)
if args.alignment_layer < 0:
args.alignment_layer = args.decoder_layers + args.alignment_layer
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_iwslt_de_en"
)
def transformer_pointer_generator_iwslt_de_en(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
args.encoder_layers = getattr(args, "encoder_layers", 6)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
args.decoder_layers = getattr(args, "decoder_layers", 6)
transformer_pointer_generator(args)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de"
)
def transformer_pointer_generator_wmt_en_de(args):
transformer_pointer_generator(args)
# Transformer pointer-generator with the base Transformer parameters as used in
# the "Attention Is All You Need" paper (Vaswani et al., 2017)
@register_model_architecture(
"transformer_pointer_generator",
"transformer_pointer_generator_vaswani_wmt_en_de_big",
)
def transformer_pointer_generator_vaswani_wmt_en_de_big(args):
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
args.dropout = getattr(args, "dropout", 0.3)
transformer_pointer_generator(args)
@register_model_architecture(
"transformer_pointer_generator",
"transformer_pointer_generator_vaswani_wmt_en_fr_big",
)
def transformer_pointer_generator_vaswani_wmt_en_fr_big(args):
args.dropout = getattr(args, "dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big"
)
def transformer_pointer_generator_wmt_en_de_big(args):
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
# default parameters used in tensor2tensor implementation
@register_model_architecture(
"transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big_t2t"
)
def transformer_pointer_generator_wmt_en_de_big_t2t(args):
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
args.activation_dropout = getattr(args, "activation_dropout", 0.1)
transformer_pointer_generator_vaswani_wmt_en_de_big(args)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import re
import sys
class OOVIndexError(IndexError):
def __init__(self, pos, source_seq, target_seq):
super(OOVIndexError, self).__init__(
"A <unk-N> tag in the target sequence refers to a position that is "
"outside the source sequence. Most likely there was a mismatch in "
"provided source and target sequences. Otherwise this would mean that "
"the pointing mechanism somehow attended to a position that is past "
"the actual sequence end."
)
self.source_pos = pos
self.source_seq = source_seq
self.target_seq = target_seq
def replace_oovs(source_in, target_in, target_out):
"""Replaces <unk-N> tokens in the target text with the corresponding word in
the source text.
"""
oov_re = re.compile("^<unk-([0-9]+)>$")
for source_seq, target_seq in zip(source_in, target_in):
target_seq_out = []
pos_to_word = source_seq.strip().split()
for token in target_seq.strip().split():
m = oov_re.match(token)
if m:
pos = int(m.group(1))
if pos >= len(pos_to_word):
raise OOVIndexError(pos, source_seq, target_seq)
token_out = pos_to_word[pos]
else:
token_out = token
target_seq_out.append(token_out)
target_out.write(" ".join(target_seq_out) + "\n")
def main():
parser = argparse.ArgumentParser(
description="Replaces <unk-N> tokens in target sequences with words from "
"the corresponding position in the source sequence."
)
parser.add_argument(
"--source", type=str, help="text file with source sequences", required=True
)
parser.add_argument(
"--target", type=str, help="text file with target sequences", required=True
)
parser.add_argument(
"--target-out",
type=str,
help="where to write target sequences without <unk-N> " "entries",
required=True,
)
args = parser.parse_args()
with open(args.source, "r", encoding="utf-8") as source_in, open(
args.target, "r", encoding="utf-8"
) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out:
replace_oovs(source_in, target_in, target_out)
if __name__ == "__main__":
try:
main()
except OOVIndexError as e:
print(e, file=sys.stderr)
print("Source sequence:", e.source_seq.strip(), file=sys.stderr)
print("Target sequence:", e.target_seq.strip(), file=sys.stderr)
print(
"Source sequence length:",
len(e.source_seq.strip().split()),
file=sys.stderr,
)
print("The offending tag points to:", e.source_pos)
sys.exit(2)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
from itertools import zip_longest
def replace_oovs(source_in, target_in, vocabulary, source_out, target_out):
"""Replaces out-of-vocabulary words in source and target text with <unk-N>,
    where N is the position of the word in the source sequence.
"""
def format_unk(pos):
return "<unk-{}>".format(pos)
if target_in is None:
target_in = []
for seq_num, (source_seq, target_seq) in enumerate(
zip_longest(source_in, target_in)
):
source_seq_out = []
target_seq_out = []
word_to_pos = dict()
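        # Maps each OOV word to the position of its first occurrence in the source.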
for position, token in enumerate(source_seq.strip().split()):
if token in vocabulary:
token_out = token
else:
if token in word_to_pos:
oov_pos = word_to_pos[token]
else:
word_to_pos[token] = position
oov_pos = position
token_out = format_unk(oov_pos)
source_seq_out.append(token_out)
source_out.write(" ".join(source_seq_out) + "\n")
if target_seq is not None:
for token in target_seq.strip().split():
if token in word_to_pos:
token_out = format_unk(word_to_pos[token])
else:
token_out = token
target_seq_out.append(token_out)
if target_out is not None:
target_out.write(" ".join(target_seq_out) + "\n")
def main():
parser = argparse.ArgumentParser(
description="Replaces out-of-vocabulary words in both source and target "
"sequences with tokens that indicate the position of the word "
"in the source sequence."
)
parser.add_argument(
"--source", type=str, help="text file with source sequences", required=True
)
parser.add_argument(
"--target", type=str, help="text file with target sequences", default=None
)
parser.add_argument("--vocab", type=str, help="vocabulary file", required=True)
parser.add_argument(
"--source-out",
type=str,
help="where to write source sequences with <unk-N> entries",
required=True,
)
parser.add_argument(
"--target-out",
type=str,
help="where to write target sequences with <unk-N> entries",
default=None,
)
args = parser.parse_args()
with open(args.vocab, encoding="utf-8") as vocab:
vocabulary = vocab.read().splitlines()
target_in = (
open(args.target, "r", encoding="utf-8") if args.target is not None else None
)
target_out = (
open(args.target_out, "w", encoding="utf-8")
if args.target_out is not None
else None
)
with open(args.source, "r", encoding="utf-8") as source_in, open(
args.source_out, "w", encoding="utf-8"
) as source_out:
replace_oovs(source_in, target_in, vocabulary, source_out, target_out)
if target_in is not None:
target_in.close()
if target_out is not None:
target_out.close()
if __name__ == "__main__":
main()
# Training with Quantization Noise for Extreme Model Compression ({Fan\*, Stock\*} *et al.*, 2020)
This page contains information for how to train and quantize models with Quantization Noise, for both scalar quantization like `int8` and Iterative Product Quantization.
Check out our paper [here](https://arxiv.org/abs/2004.07320).
Looking for pretrained models? They will be added shortly.
Looking for code to train vision models? We are working on open sourcing our code as part of ClassyVision. Please check back, but note that both the Scalar and Iterative Product Quantization counterparts of the `nn.Conv2d` module are already included in this release.
**Contents**:
- [Walk through of code](#walk-through-the-code)
- [Reproduce NLP Results](#looking-to-reproduce-the-nlp-results-in-the-paper)
- [Reproduce Vision Results](#looking-to-reproduce-the-vision-results-in-the-paper)
## Citation
```bibtex
@article{fan2020training,
title={Training with Quantization Noise for Extreme Model Compression},
author={Angela Fan* and Pierre Stock* and Benjamin Graham and Edouard Grave and Remi Gribonval and Herve Jegou and Armand Joulin},
year={2020},
eprint={2004.07320},
archivePrefix={arXiv},
primaryClass={cs.ML}
}
```
## Walk through the code
Training a model with Quant-Noise improves the performance of subsequent inference-time quantization by training the model to be robust to quantization. This technique is useful for both scalar and product quantization methods, and across multiple domains. We detail below our approach to training and quantizing models, and how to integrate our code to quantize your favorite models.
### Scalar Quantization
Unlike the section [Iterative Product Quantization](#iterative-product-quantization) which gives state-of-the-art compression, this section showcases the usefulness of our approach for simple scalar quantization baselines such as int8 using on-GPU Fake Quantization.
#### Training
Scalar quantization with Quant-Noise consists of randomly quantizing a proportion `p` of the weights during training. Scalar quantization is implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar) in the form of Fake Quantization, meaning that we emulate int8 on GPU by quantizing and de-quantizing both the weights and the activations. We rely on PyTorch's [quantization primitives](https://github.com/pytorch/pytorch/tree/master/torch/quantization).
To train a model with Quant-Noise, add the following flag:
```
--quant-noise-scalar 0.5
```
Large values of noise make the network easier to quantize but may result in higher non-quantized test and validation perplexities.
#### Quantization
When evaluating a network, all quantized modules and activation hooks automatically switch to `p=1`, so the validation accuracy reported by Fairseq is already the quantized accuracy; there is nothing more to do.
#### Integration with your own code
Looking to quantize your own models with Quant-Noise + Scalar Quantization?
- Use the function `quantize_model_` implemented [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/scalar/utils.py) to (1) replace all your modules by their quantized counterparts and (2) add hooks to those modules to quantize the activations.
- Then, perform your training as usual. Note that in `eval()` mode, the network is always fully quantized (weights and activations) by default (`p=1`).
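For reference, here is a minimal sketch of that integration, assuming `quantize_model_` is importable from `fairseq.modules.quantization.scalar` as linked above; the `p`, `bits`, and `update_step` values are illustrative, and `build_model()` is a hypothetical stand-in for your own model constructor:

```python
from fairseq.modules.quantization.scalar import quantize_model_

model = build_model()  # hypothetical helper: construct your own model here

# Replace supported modules with their quantized counterparts and register
# hooks that quantize the activations (Fake Quantization, emulating int8 on GPU).
quantize_model_(model, p=0.5, bits=8, update_step=1000)

# Train as usual; in eval() mode the network is fully quantized (p=1) by default.
model.train()
```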
### Iterative Product Quantization
Iterative Product Quantization with Quant-Noise proceeds in two steps. First, a model must be trained uncompressed with Quant-Noise. Second, the model must be quantized with iPQ. Note that we implement the simplest form of noise here, which consists of randomly dropping a proportion `p` of blocks; this worked as well as assigning those blocks to their current centroid.
#### Training
To train a model with Quant-Noise, add the following flags:
```
--quant-noise-pq 0.1 --quant-noise-pq-block-size 8
```
`quant-noise-pq` controls how much dropout is applied to the blocks of the weight matrix. `quant-noise-pq-block-size` controls the size of the weight matrix blocks.
We recommend training with 0.05 to 0.2 Quant-Noise, a range that worked well in our experiments. For the block size, we recommend training with a block size of 8. Note that `input_features` must be a multiple of the block size; see the size checks [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py). Larger block sizes result in a higher compression ratio but may induce a loss in accuracy.
We currently support training Transformer-based models, such as sequence-to-sequence models, language models, and BERT architectures. The `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py) wraps a module. It splits a weight matrix into blocks and applies random dropout to these blocks.
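As a minimal sketch of the wrapping step (the layer sizes here are illustrative; `quant_noise(module, p, block_size)` is the function linked above):

```python
import torch.nn as nn
from fairseq.modules.quant_noise import quant_noise

# During training, a proportion p of weight blocks of size block_size is dropped
# at each forward pass; at inference the wrapper leaves the module untouched.
# Note that in_features (1024) must be a multiple of block_size (8).
layer = quant_noise(nn.Linear(1024, 1024), p=0.1, block_size=8)
```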
In the Transformer architectures, quant-noise is applied to the input and output embeddings, the attention, and the FFN.
Quant-Noise can also be combined with **LayerDrop** (see [here](https://github.com/pytorch/fairseq/tree/master/examples/layerdrop)) to add its pruning effect to the quantized model and make the model even smaller. We recommend training with LayerDrop 0.1 or 0.2.
#### Quantization
We implement an improved version of product quantization from Stock et al., **iPQ**, described [here](https://arxiv.org/abs/1907.05686); see code with the old API [here](https://github.com/facebookresearch/kill-the-bits). Note that we improved the iPQ API in terms of both compute speed and usability, as described below.
For the particular case of PQ, quantization is performed sequentially. We recommend first quantizing the FFNs, then the EMBs, and finally the ATTNs. Quantization is done in two sub-steps:
- First, perform `n` steps of Product Quantization (generally `n=20` is enough).
- Then, finetune the obtained centroids.
#### Integration with your own code
Looking to quantize your own models with Quant-Noise + iPQ?
- First wrap your modules with the `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quant_noise.py), which is module-agnostic and train your favorite model.
- Then, quantize your trained model using the code [here](https://github.com/pytorch/fairseq/tree/master/fairseq/modules/quantization/pq). This can be done *without any changes to your training loop*. Below is example code for the integration.
Note that we tried our approach only on Transformers and various Convolutional Models such as EfficientNets.
```python
from fairseq.modules.quantization.pq import quantize_model_, SizeTracker
# get configuration parameters
n_centroids_config = config["n_centroids"]
block_sizes_config = config["block_sizes"]
layers_to_quantize = config["layers_to_quantize"]
# size tracker for keeping track of assignments, centroids and non-compressed sizes
size_tracker = SizeTracker(model)
# Quantize model by stages
for step in range(len(layers_to_quantize)):
# quantize model in-place
quantized_layers = quantize_model_(
model,
size_tracker,
layers_to_quantize,
block_sizes_config,
n_centroids_config,
step=step,
)
logger.info(f"Finetuning stage {step}, quantized layers: {quantized_layers}")
logger.info(f"{size_tracker}")
# Don't forget to re-create/update trainer/optimizer since model parameters have changed
optimizer = ...
# Finetune the centroids with your usual training loop for a few epochs
trainer.train_epoch()
```
## Looking to reproduce the NLP results in the paper?
We detail below how to reproduce the state-of-the-art results reported in the paper for Quant-Noise + Iterative Product Quantization.
### Training with Quant-Noise
To **train** RoBERTa + QuantNoise, we followed the setting described [here](https://github.com/pytorch/fairseq/tree/master/examples/roberta).
The following command can be used to train a RoBERTa Base + QuantNoise model:
```bash
TOTAL_UPDATES=125000
WARMUP_UPDATES=10000
PEAK_LR=0.0005
TOKENS_PER_SAMPLE=512
MAX_POSITIONS=512
MAX_SENTENCES=16
UPDATE_FREQ=2
DATA_DIR=/path/to/data/here
fairseq-train $DATA_DIR \
--task masked_lm --criterion masked_lm --arch roberta_base \
--sample-break-mode complete \
--tokens-per-sample $TOKENS_PER_SAMPLE --max-positions $MAX_POSITIONS \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.01 \
--batch-size $MAX_SENTENCES \
--update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \
--save-dir checkpoint/roberta \
--ddp-backend no_c10d --encoder-layerdrop 0.2 \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta
```
To **finetune** RoBERTa + QuantNoise, we followed the setting described [here](https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md).
The following command can be used to finetune a RoBERTa Base + QuantNoise model on the RTE dataset:
```bash
TOTAL_NUM_UPDATES=2036
WARMUP_UPDATES=122
LR=2e-05
NUM_CLASSES=2
MAX_SENTENCES=16
ROBERTA_PATH=/path/to/roberta_quantnoise/model.pt
fairseq-train /path/to/rte/data/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--ddp-backend no_c10d \
--quant-noise-pq 0.2 --quant-noise-pq-block-size 8
```
To **train** Language Models on Wikitext-103, we followed the setting described [here](https://github.com/pytorch/fairseq/tree/master/examples/language_model).
The following command can be used to train a Transformer + QuantNoise model on Wikitext-103:
```bash
fairseq-train --task language_modeling /path/to/wikitext-103/data \
--save-dir checkpoints/transformer_wikitext-103 \
--adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \
--adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \
--tie-adaptive-proj --tie-adaptive-weights \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--clip-norm 0.1 --criterion adaptive_loss \
--ddp-backend no_c10d \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \
--decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 1.0 --t-mult 2.0 \
--max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \
--sample-break-mode none --update-freq 3 \
--warmup-init-lr 1e-07 --warmup-updates 16000 \
--weight-decay 0 --seed 1 --min-lr 1e-09 \
--quant-noise-pq 0.05 --quant-noise-pq-block-size 8
```
To **evaluate** this model, note you need to use the `eval.py` script. The following command can be used to evaluate:
```bash
fairseq-eval-lm /path/to/wikitext-103/data --path /path/to/model/checkpoint \
--sample-break-mode complete \
--max-tokens 3072 \
--context-window 2560 \
--softmax-batch 1024 \
--gen-subset valid
```
and change the `--gen-subset` to `test` if you would like to evaluate on the test set instead.
### Iterative Product Quantization
To quantize the finetuned RoBERTa model, we use this command on 1 GPU. This should run in a day.
```bash
TOTAL_NUM_UPDATES=6108 # 2036 updates for each iteration
WARMUP_UPDATES=122
LR=2e-05
NUM_CLASSES=2
MAX_SENTENCES=16
ROBERTA_PATH=/path/to/finetuned/roberta/model.pt  # checkpoint from the finetuning step above
fairseq-train --task sentence_prediction /path/to/data/ \
--restore-file $ROBERTA_PATH \
--save-dir checkpoints/roberta_finetuned \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 --lr-scheduler polynomial_decay \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend no_c10d \
--quantization-config-path /path/to/config/yaml
```
To quantize the trained Language Model, we use this command on 8 V100 23GB GPUs. This should run in a couple of hours.
```bash
fairseq-train --task language_modeling /path/to/wikitext-103/data \
--save-dir checkpoints/transformer_wikitext-103 \
--adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \
--adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \
--arch transformer_lm_gbw \
--attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \
--bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \
--clip-norm 0.1 --criterion adaptive_loss \
--ddp-backend no_c10d \
--decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \
--fp16 --keep-last-epochs -1 \
--lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --max-lr 0.05 --min-lr 1e-09 \
--max-tokens 2944 --tokens-per-sample 2944 \
--momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \
--sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \
--tie-adaptive-proj --tie-adaptive-weights --update-freq 3 --weight-decay 0 --seed 1 \
--log-interval 100 \
--restore-file path/to/trained/lm/with/quant/noise \
--max-update 13500 --quantization-config-path /path/to/config/yaml
```
If you have less capacity or if your distributed training freezes, try reducing `--max-tokens` and `--tokens-per-sample` (this may reduce the quantized accuracy a bit).
### Remarks
We try to keep the open-sourced code as readable and as easy-to-plug as possible. Therefore, we did not test it for the following cases:
- Scalar quantization with RoBERTa.
- Quantization with iPQ and `int8` combined.
If you have trouble adapting it, we will be more than happy to help!
## Looking to reproduce the Vision results in the paper?
We are working on open sourcing our code as part of ClassyVision. Please check back.
## Having an issue or have a question?
Please open an issue in this repository with the details of your question. Thanks!
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This file defines example configuration arguments for quantizing
# a transformer model with product quantization
# Number of Centroids for Product Quantization, by default 256 (byte-aligned)
n_centroids:
Linear:
key: in_features
value: {"*": 256}
Embedding:
key: embedding_dim
value: {"*": 256}
# Block Sizes for Product Quantization
# We suggest: 8 for FFN, 4 for ATTN, 4 for embedding projections, 8 for embeddings
block_sizes:
Linear:
key: fuzzy_name
value: {fc: 8, attn: 4, emb: 4}
Embedding:
key: fuzzy_name
value: {emb: 8}
# Layers to Quantize Sequentially
# We suggest: first FFN, then EMB, then ATTN
layers_to_quantize:
- decoder\\.layers\\.\d+\\.fc[12]
- decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]
- decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)
# Finetuning RoBERTa on a custom classification task
This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks.
### 1) Get the data
```bash
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar zxvf aclImdb_v1.tar.gz
```
### 2) Format data
The `IMDB` data has one sample per file; the Python code snippet below merges them into a single file each for train and valid, for ease of processing.
```python
import argparse
import os
import random
from glob import glob
random.seed(0)
def main(args):
for split in ['train', 'test']:
samples = []
for class_label in ['pos', 'neg']:
fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt')
for fname in fnames:
with open(fname) as fin:
line = fin.readline()
samples.append((line, 1 if class_label == 'pos' else 0))
random.shuffle(samples)
out_fname = 'train' if split == 'train' else 'dev'
f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w')
f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w')
for sample in samples:
f1.write(sample[0] + '\n')
f2.write(str(sample[1]) + '\n')
f1.close()
f2.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--datadir', default='aclImdb')
args = parser.parse_args()
main(args)
```
### 3) BPE encode
Run `multiprocessing_bpe_encoder`. You could also do this in the previous step for each sample, but that might be slower.
```bash
# Download encoder.json and vocab.bpe
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
for SPLIT in train dev; do
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "aclImdb/$SPLIT.input0" \
--outputs "aclImdb/$SPLIT.input0.bpe" \
--workers 60 \
--keep-empty
done
```
### 4) Preprocess data
```bash
# Download fairseq dictionary.
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
fairseq-preprocess \
--only-source \
--trainpref "aclImdb/train.input0.bpe" \
--validpref "aclImdb/dev.input0.bpe" \
--destdir "IMDB-bin/input0" \
--workers 60 \
--srcdict dict.txt
fairseq-preprocess \
--only-source \
--trainpref "aclImdb/train.label" \
--validpref "aclImdb/dev.label" \
--destdir "IMDB-bin/label" \
--workers 60
```
### 5) Run training
```bash
TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32
WARMUP_UPDATES=469 # 6 percent of the number of updates
LR=1e-05 # Peak LR for polynomial LR scheduler.
HEAD_NAME=imdb_head # Custom name for the classification head.
NUM_CLASSES=2 # Number of classes for the classification task.
MAX_SENTENCES=8 # Batch size.
ROBERTA_PATH=/path/to/roberta.large/model.pt
CUDA_VISIBLE_DEVICES=0 fairseq-train IMDB-bin/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--classification-head-name $HEAD_NAME \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--shorten-method "truncate" \
--find-unused-parameters \
--update-freq 4
```
The above command will finetune RoBERTa-large with an effective batch-size of 32
sentences (`--batch-size=8 --update-freq=4`). The expected
`best-validation-accuracy` after 10 epochs is ~96.5%.
If you run out of GPU memory, try decreasing `--batch-size` and increasing
`--update-freq` to compensate.
### 6) Load model using hub interface
Now we can load the trained model checkpoint using the RoBERTa hub interface.
Assuming your checkpoints are stored in `checkpoints/`:
```python
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
'checkpoints',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='IMDB-bin'
)
roberta.eval() # disable dropout
```
Finally you can make predictions using the `imdb_head` (or whatever you set
`--classification-head-name` to during training):
```python
label_fn = lambda label: roberta.task.label_dictionary.string(
[label + roberta.task.label_dictionary.nspecial]
)
tokens = roberta.encode('Best movie this year')
pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
assert pred == '1' # positive
tokens = roberta.encode('Worst movie ever')
pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
assert pred == '0' # negative
```
# Finetuning RoBERTa on GLUE tasks
### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands:
```bash
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir glue_data --tasks all
```
### 2) Preprocess GLUE task data:
```bash
./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name>
```
`glue_task_name` is one of the following:
`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}`
Use `ALL` to preprocess all the GLUE tasks.
### 3) Fine-tuning on GLUE task:
Example fine-tuning command for the `RTE` task:
```bash
TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16
WARMUP_UPDATES=122 # 6 percent of the number of updates
LR=2e-05 # Peak LR for polynomial LR scheduler.
NUM_CLASSES=2
MAX_SENTENCES=16 # Batch size.
ROBERTA_PATH=/path/to/roberta/model.pt
CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \
--restore-file $ROBERTA_PATH \
--max-positions 512 \
--batch-size $MAX_SENTENCES \
--max-tokens 4400 \
--task sentence_prediction \
--reset-optimizer --reset-dataloader --reset-meters \
--required-batch-size-multiple 1 \
--init-token 0 --separator-token 2 \
--arch roberta_large \
--criterion sentence_prediction \
--num-classes $NUM_CLASSES \
--dropout 0.1 --attention-dropout 0.1 \
--weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--max-epoch 10 \
--find-unused-parameters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric;
```
For each GLUE task, you will need to use the following command-line arguments:
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1
`--lr` | 1e-5 | 1e-5 | 1e-5 | 2e-5 | 1e-5 | 1e-5 | 1e-5 | 2e-5
`--batch-size` | 32 | 32 | 32 | 16 | 32 | 16 | 16 | 16
`--total-num-update` | 123873 | 33112 | 113272 | 2036 | 20935 | 2296 | 5336 | 3598
`--warmup-updates` | 7432 | 1986 | 28318 | 122 | 1256 | 137 | 320 | 214
For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`.
**Note:**
a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=16/32` depending on the task.
b) The above cmd-args and hyperparams were tested on one Nvidia `V100` GPU with `32GB` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`.
c) All the settings in the above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with a wider hyperparam search.
### Inference on GLUE task
After training the model as described in the previous step, you can perform inference with the checkpoints in the `checkpoints/` directory using the following Python code snippet:
```python
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained(
'checkpoints/',
checkpoint_file='checkpoint_best.pt',
data_name_or_path='RTE-bin'
)
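# Map a predicted class index back to its label string in the label dictionary.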
label_fn = lambda label: roberta.task.label_dictionary.string(
[label + roberta.task.label_dictionary.nspecial]
)
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('glue_data/RTE/dev.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[1], tokens[2], tokens[3]
tokens = roberta.encode(sent1, sent2)
prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
prediction_label = label_fn(prediction)
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
```
# RoBERTa: A Robustly Optimized BERT Pretraining Approach
https://arxiv.org/abs/1907.11692
## Introduction
RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details.
### What's New:
- January 2020: Italian model (UmBERTo) is available from Musixmatch Research: [UmBERTo](https://github.com/musixmatchresearch/umberto).
- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/master/examples/camembert).
- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
- September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers).
- August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers).
- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/master/examples/roberta/wsc#roberta-training-on-winogrande-dataset).
- August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
## Pre-trained models
Model | Description | # params | Download
---|---|---|---
`roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz)
`roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz)
`roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz)
`roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz)
## Results
**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B
---|---|---|---|---|---|---|---|---
`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2
`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4
`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | -
**[SuperGLUE (Wang et al., 2019)](https://super.gluebenchmark.com/)**
_(dev set, single model, single-task finetuning)_
Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC
---|---|---|---|---|---|---|---
`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | -
`roberta.large.wsc` | - | - | - | - | - | - | 91.3
**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)**
_(dev set, no additional data used)_
Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1
---|---|---
`roberta.large` | 88.9/94.6 | 86.5/89.4
**[RACE (Lai et al., 2017)](http://www.qizhexie.com/data/RACE_leaderboard.html)**
_(test set)_
Model | Accuracy | Middle | High
---|---|---|---
`roberta.large` | 83.2 | 86.5 | 81.3
**[HellaSwag (Zellers et al., 2019)](https://rowanzellers.com/hellaswag/)**
_(test set)_
Model | Overall | In-domain | Zero-shot | ActivityNet | WikiHow
---|---|---|---|---|---
`roberta.large` | 85.2 | 87.3 | 83.1 | 74.6 | 90.9
**[Commonsense QA (Talmor et al., 2019)](https://www.tau-nlp.org/commonsenseqa)**
_(test set)_
Model | Accuracy
---|---
`roberta.large` (single model) | 72.1
`roberta.large` (ensemble) | 72.5
**[Winogrande (Sakaguchi et al., 2019)](https://arxiv.org/abs/1907.10641)**
_(test set)_
Model | Accuracy
---|---
`roberta.large` | 78.1
**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)**
_(TRANSLATE-TEST)_
Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81
## Example usage
##### Load RoBERTa from torch.hub (PyTorch >= 1.1):
```python
import torch
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval() # disable dropout (or leave in train mode to finetune)
```
##### Load RoBERTa (for PyTorch 1.0 or custom models):
```python
# Download roberta.large model
wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz
tar -xzvf roberta.large.tar.gz
# Load the model in fairseq
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('/path/to/roberta.large', checkpoint_file='model.pt')
roberta.eval() # disable dropout (or leave in train mode to finetune)
```
##### Apply Byte-Pair Encoding (BPE) to input text:
```python
tokens = roberta.encode('Hello world!')
assert tokens.tolist() == [0, 31414, 232, 328, 2]
roberta.decode(tokens) # 'Hello world!'
```
##### Extract features from RoBERTa:
```python
# Extract the last layer's features
last_layer_features = roberta.extract_features(tokens)
assert last_layer_features.size() == torch.Size([1, 5, 1024])
# Extract all layer's features (layer 0 is the embedding layer)
all_layers = roberta.extract_features(tokens, return_all_hiddens=True)
assert len(all_layers) == 25
assert torch.all(all_layers[-1] == last_layer_features)
```
##### Use RoBERTa for sentence-pair classification tasks:
```python
# Download RoBERTa already finetuned for MNLI
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval() # disable dropout for evaluation
# Encode a pair of sentences and make a prediction
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')
roberta.predict('mnli', tokens).argmax() # 0: contradiction
# Encode another pair of sentences
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.')
roberta.predict('mnli', tokens).argmax() # 2: entailment
```
##### Register a new (randomly initialized) classification head:
```python
roberta.register_classification_head('new_task', num_classes=3)
logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)
```
##### Batched prediction:
```python
import torch
from fairseq.data.data_utils import collate_tokens
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval()
batch_of_pairs = [
['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'],
['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'],
['potatoes are awesome.', 'I like to run.'],
['Mars is very far from earth.', 'Mars is very close.'],
]
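# Pad the encoded pairs to a common length so they can be stacked into one
# batch tensor (pad_idx=1 matches the padding index of fairseq's dictionary).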
batch = collate_tokens(
[roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
)
logprobs = roberta.predict('mnli', batch)
print(logprobs.argmax(dim=1))
# tensor([0, 2, 1, 0])
```
##### Using the GPU:
```python
roberta.cuda()
roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
```
## Advanced usage
#### Filling masks:
RoBERTa can be used to fill `<mask>` tokens in the input. Some examples from the
[Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/):
```python
roberta.fill_mask('The first Star wars movie came out in <mask>', topk=3)
# [('The first Star wars movie came out in 1977', 0.9504708051681519, ' 1977'), ('The first Star wars movie came out in 1978', 0.009986862540245056, ' 1978'), ('The first Star wars movie came out in 1979', 0.009574787691235542, ' 1979')]
roberta.fill_mask('Vikram samvat calender is official in <mask>', topk=3)
# [('Vikram samvat calender is official in India', 0.21878819167613983, ' India'), ('Vikram samvat calender is official in Delhi', 0.08547237515449524, ' Delhi'), ('Vikram samvat calender is official in Gujarat', 0.07556215673685074, ' Gujarat')]
roberta.fill_mask('<mask> is the common currency of the European Union', topk=3)
# [('Euro is the common currency of the European Union', 0.9456493854522705, 'Euro'), ('euro is the common currency of the European Union', 0.025748178362846375, 'euro'), ('€ is the common currency of the European Union', 0.011183084920048714, '€')]
```
#### Pronoun disambiguation (Winograd Schema Challenge):
RoBERTa can be used to disambiguate pronouns. First install spaCy and download the English-language model:
```bash
pip install spacy
python -m spacy download en_core_web_lg
```
Next load the `roberta.large.wsc` model and call the `disambiguate_pronoun`
function. The pronoun should be surrounded by square brackets (`[]`) and the
query referent surrounded by underscores (`_`), or left blank to return the
predicted candidate text directly:
```python
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc')
roberta.cuda() # use the GPU (optional)
roberta.disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.')
# True
roberta.disambiguate_pronoun('The trophy would not fit in the brown _suitcase_ because [it] was too big.')
# False
roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] feared violence.')
# 'The city councilmen'
roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] advocated violence.')
# 'demonstrators'
```
See the [RoBERTa Winograd Schema Challenge (WSC) README](wsc/README.md) for more details on how to train this model.
#### Extract features aligned to words:
By default RoBERTa outputs one feature vector per BPE token. You can instead
realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization)
with the `extract_features_aligned_to_words` method. This will compute a
weighted average of the BPE-level features for each word and expose them in
spaCy's `Token.vector` attribute:
```python
doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."')
assert len(doc) == 10
for tok in doc:
print('{:10}{} (...)'.format(str(tok), tok.vector[:5]))
# <s> tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=<SliceBackward>) (...)
# I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=<SliceBackward>) (...)
# said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=<SliceBackward>) (...)
# , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=<SliceBackward>) (...)
# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=<SliceBackward>) (...)
# hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=<SliceBackward>) (...)
# RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=<SliceBackward>) (...)
# . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
# " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
# </s> tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=<SliceBackward>) (...)
```
#### Evaluating the `roberta.large.mnli` model:
An example Python snippet for evaluating accuracy on the MNLI `dev_matched` set:
```python
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('glue_data/MNLI/dev_matched.tsv') as fin:
fin.readline()
for index, line in enumerate(fin):
tokens = line.strip().split('\t')
sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
tokens = roberta.encode(sent1, sent2)
prediction = roberta.predict('mnli', tokens).argmax().item()
prediction_label = label_map[prediction]
ncorrect += int(prediction_label == target)
nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Expected output: 0.9060
```
## Finetuning
- [Finetuning on GLUE](README.glue.md)
- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
- Finetuning on SQuAD: coming soon
## Pretraining using your own data
See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
## Citation
```bibtex
@article{liu2019roberta,
title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach},
author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and
Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and
Luke Zettlemoyer and Veselin Stoyanov},
journal={arXiv preprint arXiv:1907.11692},
year = {2019},
}
```
# Pretraining RoBERTa using your own data
This tutorial will walk you through pretraining RoBERTa over your own data.
### 1) Preprocess the data
Data should be preprocessed following the [language modeling format](/examples/language_model), i.e., each document should be separated by an empty line (only meaningful with `--sample-break-mode complete_doc`). Lines are concatenated into a 1D text stream during training, as in the example below.
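For example, a raw training file containing two documents might look like this (hypothetical toy data; the blank line marks the document boundary):
```
This is the first sentence of document one.
This is another sentence in the same document.

Document two starts here, after the empty line.
```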
We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)
to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course
this dataset is quite small, so the resulting pretrained model will perform
poorly, but it gives the general idea.
First download the dataset:
```bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
```
Next encode it with the GPT-2 BPE:
```bash
mkdir -p gpt2_bpe
wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
for SPLIT in train valid test; do \
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json gpt2_bpe/encoder.json \
--vocab-bpe gpt2_bpe/vocab.bpe \
--inputs wikitext-103-raw/wiki.${SPLIT}.raw \
--outputs wikitext-103-raw/wiki.${SPLIT}.bpe \
--keep-empty \
--workers 60; \
done
```
Finally preprocess/binarize the data using the GPT-2 fairseq dictionary:
```bash
wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-preprocess \
--only-source \
--srcdict gpt2_bpe/dict.txt \
--trainpref wikitext-103-raw/wiki.train.bpe \
--validpref wikitext-103-raw/wiki.valid.bpe \
--testpref wikitext-103-raw/wiki.test.bpe \
--destdir data-bin/wikitext-103 \
--workers 60
```
### 2) Train RoBERTa base
```bash
TOTAL_UPDATES=125000 # Total number of training steps
WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates
PEAK_LR=0.0005 # Peak learning rate, adjust as needed
TOKENS_PER_SAMPLE=512 # Max sequence length
MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)
MAX_SENTENCES=16 # Number of sequences per batch (batch size)
UPDATE_FREQ=16 # Increase the batch size 16x
DATA_DIR=data-bin/wikitext-103
fairseq-train --fp16 $DATA_DIR \
--task masked_lm --criterion masked_lm \
--arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \
--optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--batch-size $MAX_SENTENCES --update-freq $UPDATE_FREQ \
--max-update $TOTAL_UPDATES --log-format simple --log-interval 1
```
**Note:** You can optionally resume training the released RoBERTa base model by
adding `--restore-file /path/to/roberta.base/model.pt`.
**Note:** The above command assumes training on 8x32GB V100 GPUs. Each GPU uses
a batch size of 16 sequences (`$MAX_SENTENCES`) and accumulates gradients to
further increase the batch size by 16x (`$UPDATE_FREQ`), for a total batch size
of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need
to reduce `$MAX_SENTENCES` and increase `$UPDATE_FREQ` to compensate.
Alternatively, if you have more GPUs you can decrease `$UPDATE_FREQ` accordingly
to increase training speed.
**Note:** The learning rate and batch size are tightly coupled and need to be
adjusted together. We generally recommend increasing the learning rate as you
increase the batch size, according to the following table (although it's also
dataset dependent, so don't follow these values too rigidly; a helper sketch
follows the table):
batch size | peak learning rate
---|---
256 | 0.0001
2048 | 0.0005
8192 | 0.0007
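To make the batch-size arithmetic concrete, here is a minimal sketch (a hypothetical helper, not part of fairseq) for picking `--update-freq` given a target effective batch size:
```python
# Hypothetical helper: choose --update-freq so that
# n_gpus * batch_size_per_gpu * update_freq == target effective batch size.
def pick_update_freq(target_batch_size, n_gpus, max_sentences):
    assert target_batch_size % (n_gpus * max_sentences) == 0
    return target_batch_size // (n_gpus * max_sentences)

# The reference setup above: 8 GPUs x 16 sequences x update-freq 16 = 2048.
assert pick_update_freq(2048, n_gpus=8, max_sentences=16) == 16
# With only 2 GPUs, the same 2048-sequence batch needs --update-freq 64:
assert pick_update_freq(2048, n_gpus=2, max_sentences=16) == 64
```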
### 3) Load your pretrained model
```python
import torch
from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'path/to/data')
assert isinstance(roberta.model, torch.nn.Module)
```
# Finetuning RoBERTa on RACE tasks
### 1) Download the data from the RACE website (http://www.cs.cmu.edu/~glai1/data/race/)
### 2) Preprocess RACE data:
```bash
python ./examples/roberta/preprocess_RACE.py --input-dir <input-dir> --output-dir <extracted-data-dir>
./examples/roberta/preprocess_RACE.sh <extracted-data-dir> <output-dir>
```
### 3) Fine-tuning on RACE:
```bash
MAX_EPOCH=5 # Number of training epochs.
LR=1e-05 # Peak LR for fixed LR scheduler.
NUM_CLASSES=4
MAX_SENTENCES=1 # Batch size per GPU.
UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs.
DATA_DIR=/path/to/race-output-dir
ROBERTA_PATH=/path/to/roberta/model.pt
CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=no_c10d \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--task sentence_ranking \
--num-classes $NUM_CLASSES \
--init-token 0 --separator-token 2 \
--max-option-length 128 \
--max-positions 512 \
--shorten-method "truncate" \
--arch roberta_large \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion sentence_ranking \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
--clip-norm 0.0 \
--lr-scheduler fixed --lr $LR \
--fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
--batch-size $MAX_SENTENCES \
--required-batch-size-multiple 1 \
--update-freq $UPDATE_FREQ \
--max-epoch $MAX_EPOCH
```
**Note:**
a) As contexts in RACE are relatively long, we use a smaller per-GPU batch size while increasing `--update-freq` to achieve a larger effective batch size.
b) The above command-line arguments and hyperparameters were tested on a single Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory available to you, you can increase `--update-freq` and reduce `--batch-size`.
c) The settings in the above command are based on our hyperparameter search within a fixed search space (for careful comparison across models). You may find better metrics with a wider hyperparameter search.
### 4) Evaluation:
```bash
DATA_DIR=/path/to/race-output-dir # data directory used during training
MODEL_PATH=/path/to/checkpoint_best.pt # path to the finetuned model checkpoint
PREDS_OUT=preds.tsv # output file path to save prediction
TEST_SPLIT=test # can be test (Middle) or test1 (High)
fairseq-validate \
$DATA_DIR \
--valid-subset $TEST_SPLIT \
--path $MODEL_PATH \
--batch-size 1 \
--task sentence_ranking \
--criterion sentence_ranking \
--save-predictions $PREDS_OUT
```
# Finetuning RoBERTa on Commonsense QA
We follow a similar approach to [finetuning RACE](../README.race.md). Specifically,
for each question we construct five inputs, one for each of the five candidate
answer choices. Each input is constructed by concatenating the question and
candidate answer. We then encode each input and pass the resulting "[CLS]"
representations through a fully-connected layer to predict the correct answer.
We train with a standard cross-entropy loss.
We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to
the answer. The complete input format is:
```
<s> Q: Where would I not want a fox? </s> A: hen house </s>
```
Our final submission is based on a hyperparameter search over the learning rate
(1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000,
4000) and random seed. We selected the model with the best performance on the
development set after 100 trials.
### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa)
```bash
bash examples/roberta/commonsense_qa/download_cqa_data.sh
```
### 2) Finetune
```bash
MAX_UPDATES=3000 # Number of training steps.
WARMUP_UPDATES=150 # Linearly increase LR over this many steps.
LR=1e-05 # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=16 # Batch size.
SEED=1 # Random seed.
ROBERTA_PATH=/path/to/roberta/model.pt
DATA_DIR=data/CommonsenseQA
# we use the --user-dir option to load the task from
# the examples/roberta/commonsense_qa directory:
FAIRSEQ_PATH=/path/to/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa
CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \
$DATA_DIR \
--user-dir $FAIRSEQ_USER_DIR \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--task commonsense_qa --init-token 0 --bpe gpt2 \
--arch roberta_large --max-positions 512 \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--criterion sentence_ranking --num-classes 5 \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
--lr-scheduler polynomial_decay --lr $LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
--batch-size $MAX_SENTENCES \
--max-update $MAX_UPDATES \
--log-format simple --log-interval 25 \
--seed $SEED
```
The above command assumes training on 1 GPU with 32GB of memory. For GPUs with
less memory, decrease `--batch-size` and increase `--update-freq`
accordingly to compensate.
### 3) Evaluate
```python
import json
import torch
from fairseq.models.roberta import RobertaModel
from examples.roberta import commonsense_qa # load the Commonsense QA task
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
roberta.eval() # disable dropout
roberta.cuda() # use the GPU (optional)
nsamples, ncorrect = 0, 0
with open('data/CommonsenseQA/valid.jsonl') as h:
for line in h:
example = json.loads(line)
scores = []
for choice in example['question']['choices']:
input = roberta.encode(
'Q: ' + example['question']['stem'],
'A: ' + choice['text'],
no_separator=True
)
score = roberta.predict('sentence_classification_head', input, return_logits=True)
scores.append(score)
pred = torch.cat(scores).argmax()
answer = ord(example['answerKey']) - ord('A')
nsamples += 1
if pred == answer:
ncorrect += 1
print('Accuracy: ' + str(ncorrect / float(nsamples)))
# Accuracy: 0.7846027846027847
```
The above snippet is not batched, which makes it quite slow. See [instructions
for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta#batched-prediction).
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import commonsense_qa_task # noqa
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
import numpy as np
import torch
from fairseq.data import (
Dictionary,
IdDataset,
ListDataset,
NestedDictionaryDataset,
NumelDataset,
NumSamplesDataset,
RawLabelDataset,
RightPadDataset,
SortDataset,
data_utils,
encoders,
)
from fairseq.tasks import LegacyFairseqTask, register_task
@register_task("commonsense_qa")
class CommonsenseQATask(LegacyFairseqTask):
"""Task to finetune RoBERTa for Commonsense QA."""
@staticmethod
def add_args(parser):
"""Add task-specific arguments to the parser."""
parser.add_argument(
"data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
)
parser.add_argument(
"--init-token",
type=int,
default=None,
help="add token at the beginning of each batch item",
)
parser.add_argument("--num-classes", type=int, default=5)
def __init__(self, args, vocab):
super().__init__(args)
self.vocab = vocab
self.mask = vocab.add_symbol("<mask>")
self.bpe = encoders.build_bpe(args)
@classmethod
def load_dictionary(cls, filename):
"""Load the dictionary from the filename
Args:
filename (str): the filename
"""
dictionary = Dictionary.load(filename)
dictionary.add_symbol("<mask>")
return dictionary
@classmethod
def setup_task(cls, args, **kwargs):
assert (
args.criterion == "sentence_ranking"
), "Must set --criterion=sentence_ranking"
# load data and label dictionaries
vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
print("| dictionary: {} types".format(len(vocab)))
return cls(args, vocab)
def load_dataset(
self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
):
"""Load a given dataset split.
Args:
split (str): name of the split (e.g., train, valid, test)
"""
def binarize(s, append_bos=False):
if self.bpe is not None:
s = self.bpe.encode(s)
tokens = self.vocab.encode_line(
s,
append_eos=True,
add_if_not_exist=False,
).long()
if append_bos and self.args.init_token is not None:
tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
return tokens
if data_path is None:
data_path = os.path.join(self.args.data, split + ".jsonl")
if not os.path.exists(data_path):
raise FileNotFoundError("Cannot find data: {}".format(data_path))
src_tokens = [[] for i in range(self.args.num_classes)]
src_lengths = [[] for i in range(self.args.num_classes)]
labels = []
with open(data_path) as h:
for line in h:
example = json.loads(line.strip())
if "answerKey" in example:
label = ord(example["answerKey"]) - ord("A")
labels.append(label)
question = example["question"]["stem"]
assert len(example["question"]["choices"]) == self.args.num_classes
# format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
question = "Q: " + question
question_toks = binarize(question, append_bos=True)
for i, choice in enumerate(example["question"]["choices"]):
src = "A: " + choice["text"]
src_bin = torch.cat([question_toks, binarize(src)])
src_tokens[i].append(src_bin)
src_lengths[i].append(len(src_bin))
assert all(
len(src_tokens[0]) == len(src_tokens[i])
for i in range(self.args.num_classes)
)
assert len(src_tokens[0]) == len(src_lengths[0])
assert len(labels) == 0 or len(labels) == len(src_tokens[0])
for i in range(self.args.num_classes):
src_lengths[i] = np.array(src_lengths[i])
src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
src_lengths[i] = ListDataset(src_lengths[i])
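        # Build one net_input per candidate answer; the sentence_ranking
        # criterion scores each candidate with the shared classification head
        # and ranks them against the target label.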
dataset = {
"id": IdDataset(),
"nsentences": NumSamplesDataset(),
"ntokens": NumelDataset(src_tokens[0], reduce=True),
}
for i in range(self.args.num_classes):
dataset.update(
{
"net_input{}".format(i + 1): {
"src_tokens": RightPadDataset(
src_tokens[i],
pad_idx=self.source_dictionary.pad(),
),
"src_lengths": src_lengths[i],
}
}
)
if len(labels) > 0:
dataset.update({"target": RawLabelDataset(labels)})
dataset = NestedDictionaryDataset(
dataset,
sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
)
with data_utils.numpy_seed(self.args.seed):
dataset = SortDataset(
dataset,
# shuffle
sort_order=[np.random.permutation(len(dataset))],
)
print("| Loaded {} with {} samples".format(split, len(dataset)))
self.datasets[split] = dataset
return self.datasets[split]
def build_model(self, args):
from fairseq import models
model = models.build_model(args, self)
model.register_classification_head(
"sentence_classification_head",
num_classes=1,
)
return model
@property
def source_dictionary(self):
return self.vocab
@property
def target_dictionary(self):
return self.vocab
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
OUTDIR=data/CommonsenseQA
mkdir -p $OUTDIR
wget -O $OUTDIR/train.jsonl https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl
wget -O $OUTDIR/valid.jsonl https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl
wget -O $OUTDIR/test.jsonl https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl
wget -O $OUTDIR/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import contextlib
import sys
from collections import Counter
from multiprocessing import Pool
from fairseq.data.encoders.gpt2_bpe import get_encoder
def main():
"""
Helper script to encode raw text with the GPT-2 BPE using multiple processes.
The encoder.json and vocab.bpe files can be obtained here:
- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
- https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--encoder-json",
help="path to encoder.json",
)
parser.add_argument(
"--vocab-bpe",
type=str,
help="path to vocab.bpe",
)
parser.add_argument(
"--inputs",
nargs="+",
default=["-"],
help="input files to filter/encode",
)
parser.add_argument(
"--outputs",
nargs="+",
default=["-"],
help="path to save encoded outputs",
)
parser.add_argument(
"--keep-empty",
action="store_true",
help="keep empty lines",
)
parser.add_argument("--workers", type=int, default=20)
args = parser.parse_args()
assert len(args.inputs) == len(
args.outputs
), "number of input and output paths should match"
with contextlib.ExitStack() as stack:
inputs = [
stack.enter_context(open(input, "r", encoding="utf-8"))
if input != "-"
else sys.stdin
for input in args.inputs
]
outputs = [
stack.enter_context(open(output, "w", encoding="utf-8"))
if output != "-"
else sys.stdout
for output in args.outputs
]
encoder = MultiprocessingEncoder(args)
pool = Pool(args.workers, initializer=encoder.initializer)
encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)
stats = Counter()
for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
if filt == "PASS":
for enc_line, output_h in zip(enc_lines, outputs):
print(enc_line, file=output_h)
else:
stats["num_filtered_" + filt] += 1
if i % 10000 == 0:
print("processed {} lines".format(i), file=sys.stderr)
for k, v in stats.most_common():
print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
class MultiprocessingEncoder(object):
def __init__(self, args):
self.args = args
def initializer(self):
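        # Called once in each worker process: build a process-local encoder
        # and keep it in a module-level global so it isn't re-created (or
        # pickled) for every chunk of lines.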
global bpe
bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)
def encode(self, line):
global bpe
ids = bpe.encode(line)
return list(map(str, ids))
def decode(self, tokens):
global bpe
return bpe.decode(tokens)
def encode_lines(self, lines):
"""
Encode a set of lines. All lines will be encoded together.
"""
enc_lines = []
for line in lines:
line = line.strip()
if len(line) == 0 and not self.args.keep_empty:
return ["EMPTY", None]
tokens = self.encode(line)
enc_lines.append(" ".join(tokens))
return ["PASS", enc_lines]
def decode_lines(self, lines):
dec_lines = []
for line in lines:
tokens = map(int, line.strip().split())
dec_lines.append(self.decode(tokens))
return ["PASS", dec_lines]
if __name__ == "__main__":
main()
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# raw glue data as downloaded by glue download script (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
if [[ $# -ne 2 ]]; then
echo "Run as following:"
echo "./examples/roberta/preprocess_GLUE_tasks.sh <glue_data_folder> <task_name>"
exit 1
fi
GLUE_DATA_FOLDER=$1
# download bpe encoder.json, vocabulary and fairseq dictionary
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
TASKS=$2 # QQP
if [ "$TASKS" = "ALL" ]
then
TASKS="QQP MNLI QNLI MRPC RTE STS-B SST-2 CoLA"
fi
for TASK in $TASKS
do
echo "Preprocessing $TASK"
TASK_DATA_FOLDER="$GLUE_DATA_FOLDER/$TASK"
echo "Raw data as downloaded from glue website: $TASK_DATA_FOLDER"
SPLITS="train dev test"
INPUT_COUNT=2
if [ "$TASK" = "QQP" ]
then
INPUT_COLUMNS=( 4 5 )
TEST_INPUT_COLUMNS=( 2 3 )
LABEL_COLUMN=6
elif [ "$TASK" = "MNLI" ]
then
SPLITS="train dev_matched dev_mismatched test_matched test_mismatched"
INPUT_COLUMNS=( 9 10 )
TEST_INPUT_COLUMNS=( 9 10 )
DEV_LABEL_COLUMN=16
LABEL_COLUMN=12
elif [ "$TASK" = "QNLI" ]
then
INPUT_COLUMNS=( 2 3 )
TEST_INPUT_COLUMNS=( 2 3 )
LABEL_COLUMN=4
elif [ "$TASK" = "MRPC" ]
then
INPUT_COLUMNS=( 4 5 )
TEST_INPUT_COLUMNS=( 4 5 )
LABEL_COLUMN=1
elif [ "$TASK" = "RTE" ]
then
INPUT_COLUMNS=( 2 3 )
TEST_INPUT_COLUMNS=( 2 3 )
LABEL_COLUMN=4
elif [ "$TASK" = "STS-B" ]
then
INPUT_COLUMNS=( 8 9 )
TEST_INPUT_COLUMNS=( 8 9 )
LABEL_COLUMN=10
# Following are single sentence tasks.
elif [ "$TASK" = "SST-2" ]
then
INPUT_COLUMNS=( 1 )
TEST_INPUT_COLUMNS=( 2 )
LABEL_COLUMN=2
INPUT_COUNT=1
elif [ "$TASK" = "CoLA" ]
then
INPUT_COLUMNS=( 4 )
TEST_INPUT_COLUMNS=( 2 )
LABEL_COLUMN=2
INPUT_COUNT=1
fi
# Strip out header and filter lines that don't have expected number of fields.
rm -rf "$TASK_DATA_FOLDER/processed"
mkdir -p "$TASK_DATA_FOLDER/processed"
for SPLIT in $SPLITS
do
# CoLA train and dev don't have a header.
if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]]
then
cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
else
tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
fi
# Remove malformed lines from train and dev files for the QQP dataset.
if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]]
then
awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv";
else
cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv";
fi
rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
done
# Split into input0, input1 and label
for SPLIT in $SPLITS
do
for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
do
if [[ "$SPLIT" != test* ]]
then
COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]}
else
COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]}
fi
cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE";
done
if [[ "$SPLIT" != test* ]]
then
if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ]
then
cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label";
else
cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label";
fi
fi
# BPE encode.
for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
do
LANG="input$INPUT_TYPE"
echo "BPE encoding $SPLIT/$LANG"
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$LANG" \
--outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$LANG" \
--workers 60 \
--keep-empty;
done
done
# Remove output directory.
rm -rf "$TASK-bin"
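  # "LANG" in the prefixes below is a literal placeholder; it is substituted
  # per input type (and for labels) via bash pattern replacement, e.g.
  # "${DEVPREF//LANG/$LANG}".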
DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG"
TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG"
if [ "$TASK" = "MNLI" ]
then
DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG"
TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG"
fi
# Run fairseq preprocessing:
for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
do
LANG="input$INPUT_TYPE"
fairseq-preprocess \
--only-source \
--trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \
--validpref "${DEVPREF//LANG/$LANG}" \
--testpref "${TESTPREF//LANG/$LANG}" \
--destdir "$TASK-bin/$LANG" \
--workers 60 \
--srcdict dict.txt;
done
if [[ "$TASK" != "STS-B" ]]
then
fairseq-preprocess \
--only-source \
--trainpref "$TASK_DATA_FOLDER/processed/train.label" \
--validpref "${DEVPREF//LANG/label}" \
--destdir "$TASK-bin/label" \
--workers 60;
else
# For STS-B, the output range is scaled to [0.0, 1.0]
mkdir -p "$TASK-bin/label"
awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label"
awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label"
fi
done
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import json
import os
import re
class InputExample:
def __init__(self, paragraph, qa_list, label):
self.paragraph = paragraph
self.qa_list = qa_list
self.label = label
def get_examples(data_dir, set_type):
"""
Extract the paragraph and question-answer list from each JSON file
"""
examples = []
levels = ["middle", "high"]
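    # A set_type like "test-middle" or "test-high" restricts extraction to one
    # difficulty level; plain "train"/"dev"/"test" cover both levels.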
set_type_c = set_type.split("-")
if len(set_type_c) == 2:
levels = [set_type_c[1]]
set_type = set_type_c[0]
for level in levels:
cur_dir = os.path.join(data_dir, set_type, level)
for filename in os.listdir(cur_dir):
cur_path = os.path.join(cur_dir, filename)
with open(cur_path, "r") as f:
cur_data = json.load(f)
answers = cur_data["answers"]
options = cur_data["options"]
questions = cur_data["questions"]
context = cur_data["article"].replace("\n", " ")
context = re.sub(r"\s+", " ", context)
for i in range(len(answers)):
label = ord(answers[i]) - ord("A")
qa_list = []
question = questions[i]
for j in range(4):
option = options[i][j]
if "_" in question:
qa_cat = question.replace("_", option)
else:
qa_cat = " ".join([question, option])
qa_cat = re.sub(r"\s+", " ", qa_cat)
qa_list.append(qa_cat)
examples.append(InputExample(context, qa_list, label))
return examples
def main():
"""
Helper script to extract paragraphs, questions, and answers from the RACE dataset.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-dir",
help="input directory for downloaded RACE dataset",
)
parser.add_argument(
"--output-dir",
help="output directory for extracted data",
)
args = parser.parse_args()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir, exist_ok=True)
for set_type in ["train", "dev", "test-middle", "test-high"]:
examples = get_examples(args.input_dir, set_type)
qa_file_paths = [
os.path.join(args.output_dir, set_type + ".input" + str(i + 1))
for i in range(4)
]
qa_files = [open(qa_file_path, "w") for qa_file_path in qa_file_paths]
outf_context_path = os.path.join(args.output_dir, set_type + ".input0")
outf_label_path = os.path.join(args.output_dir, set_type + ".label")
outf_context = open(outf_context_path, "w")
outf_label = open(outf_label_path, "w")
for example in examples:
outf_context.write(example.paragraph + "\n")
for i in range(4):
qa_files[i].write(example.qa_list[i] + "\n")
outf_label.write(str(example.label) + "\n")
for f in qa_files:
f.close()
outf_label.close()
outf_context.close()
if __name__ == "__main__":
main()
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# data should be downloaded and processed with preprocess_RACE.py
if [[ $# -ne 2 ]]; then
echo "Run as following:"
echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>"
exit 1
fi
RACE_DATA_FOLDER=$1
OUT_DATA_FOLDER=$2
# download bpe encoder.json, vocabulary and fairseq dictionary
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
SPLITS="train dev test-middle test-high"
INPUT_TYPES="input0 input1 input2 input3 input4"
for INPUT_TYPE in $INPUT_TYPES
do
for SPLIT in $SPLITS
do
echo "BPE encoding $SPLIT/$INPUT_TYPE"
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \
--outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \
--workers 10 \
--keep-empty;
done
done
for INPUT_TYPE in $INPUT_TYPES
do
fairseq-preprocess \
--only-source \
--trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
--validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
--testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
--destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
--workers 10 \
--srcdict dict.txt;
done
rm -rf "$OUT_DATA_FOLDER/label"
mkdir -p "$OUT_DATA_FOLDER/label"
cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/"
cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label"
cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label"
cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label"
# Finetuning RoBERTa on Winograd Schema Challenge (WSC) data
The following instructions can be used to finetune RoBERTa on the WSC training
data provided by [SuperGLUE](https://super.gluebenchmark.com/).
Note that there is high variance in the results. For our GLUE/SuperGLUE
submission we swept over the learning rate (1e-5, 2e-5, 3e-5), batch size (16,
32, 64) and total number of updates (500, 1000, 2000, 3000), as well as the
random seed. Out of ~100 runs we chose the best 7 models and ensembled them.
**Approach:** The instructions below use a slightly different loss function than
what's described in the original RoBERTa arXiv paper. In particular,
[Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin
ranking loss between `(query, candidate)` pairs with tunable hyperparameters
alpha and beta. This is supported in our code as well with the `--wsc-alpha` and
`--wsc-beta` arguments. However, we achieved slightly better (and more robust)
results on the development set by instead using a single cross entropy loss term
over the log-probabilities for the query and all mined candidates. **The
candidates are mined using spaCy from each input sentence in isolation, so the
approach remains strictly pointwise.** This reduces the number of
hyperparameters and our best model achieved 92.3% development set accuracy,
compared to ~90% accuracy for the margin loss. Later versions of the RoBERTa
arXiv paper will describe this updated formulation.
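To make the two objectives concrete, here is a minimal, self-contained sketch of both loss variants (illustrative numbers and a simplified form, not the exact fairseq criterion code):
```python
import torch
import torch.nn.functional as F

# Illustrative values only: `query_logprob` is the model's log-probability for
# the query span, `cand_logprobs` for the spaCy-mined candidate spans.
query_logprob = torch.tensor(-1.2)
cand_logprobs = torch.tensor([-2.3, -1.9, -3.1])

# (a) Margin ranking loss (Kocijan et al., 2019) with tunable alpha/beta,
# as exposed via --wsc-alpha / --wsc-beta (values here are arbitrary):
alpha, beta = 1.0, 0.4
margin_loss = (alpha * (cand_logprobs - query_logprob + beta).clamp(min=0)).sum()

# (b) Single cross-entropy term over query + candidates, treating the query
# (index 0) as the correct class, as selected by --wsc-cross-entropy:
scores = torch.cat([query_logprob.view(1), cand_logprobs]).unsqueeze(0)
ce_loss = F.cross_entropy(scores, torch.tensor([0]))
print(margin_loss.item(), ce_loss.item())
```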
### 1) Download the WSC data from the SuperGLUE website:
```bash
wget https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip
unzip WSC.zip
# we also need to copy the RoBERTa dictionary into the same directory
wget -O WSC/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
```
### 2) Finetune over the provided training data:
```bash
TOTAL_NUM_UPDATES=2000 # Total number of training steps.
WARMUP_UPDATES=250 # Linearly increase LR over this many steps.
LR=2e-05 # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=16 # Batch size per GPU.
SEED=1 # Random seed.
ROBERTA_PATH=/path/to/roberta/model.pt
# we use the --user-dir option to load the task and criterion
# from the examples/roberta/wsc directory:
FAIRSEQ_PATH=/path/to/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--valid-subset val \
--fp16 --ddp-backend no_c10d \
--user-dir $FAIRSEQ_USER_DIR \
--task wsc --criterion wsc --wsc-cross-entropy \
--arch roberta_large --bpe gpt2 --max-positions 512 \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
--lr-scheduler polynomial_decay --lr $LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \
--batch-size $MAX_SENTENCES \
--max-update $TOTAL_NUM_UPDATES \
--log-format simple --log-interval 100 \
--seed $SEED
```
The above command assumes training on 4 GPUs, but you can achieve the same
results on a single GPU by adding `--update-freq=4`.
### 3) Evaluate
```python
from fairseq.models.roberta import RobertaModel
from examples.roberta.wsc import wsc_utils # also loads WSC task and criterion
roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'WSC/')
roberta.cuda()
nsamples, ncorrect = 0, 0
for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True):
pred = roberta.disambiguate_pronoun(sentence)
nsamples += 1
if pred == label:
ncorrect += 1
print('Accuracy: ' + str(ncorrect / float(nsamples)))
# Accuracy: 0.9230769230769231
```
## RoBERTa training on the WinoGrande dataset
We also provide a `winogrande` task and criterion for finetuning on
[WinoGrande](https://mosaic.allenai.org/projects/winogrande)-like datasets,
where there are always exactly two candidates and one is correct.
This implementation is more efficient for that special case.
```bash
TOTAL_NUM_UPDATES=23750 # Total number of training steps.
WARMUP_UPDATES=2375 # Linearly increase LR over this many steps.
LR=1e-05 # Peak LR for polynomial LR scheduler.
MAX_SENTENCES=32 # Batch size per GPU.
SEED=1 # Random seed.
ROBERTA_PATH=/path/to/roberta/model.pt
# we use the --user-dir option to load the task and criterion
# from the examples/roberta/wsc directory:
FAIRSEQ_PATH=/path/to/fairseq
FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc
cd fairseq
CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \
--restore-file $ROBERTA_PATH \
--reset-optimizer --reset-dataloader --reset-meters \
--no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
--best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
--valid-subset val \
--fp16 --ddp-backend no_c10d \
--user-dir $FAIRSEQ_USER_DIR \
--task winogrande --criterion winogrande \
--wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \
--arch roberta_large --bpe gpt2 --max-positions 512 \
--dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
--optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
--lr-scheduler polynomial_decay --lr $LR \
--warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \
--batch-size $MAX_SENTENCES \
--max-update $TOTAL_NUM_UPDATES \
--log-format simple --log-interval 100
```
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from . import wsc_criterion # noqa
from . import wsc_task # noqa