Commit 0558c9cb authored by thomwolf

Merge branch 'master' into t5

parents 608a8f5b e57d00ee
......@@ -46,6 +46,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
},
'merges_file':
......@@ -53,6 +54,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
},
}
......@@ -61,6 +63,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'gpt2': 1024,
'gpt2-medium': 1024,
'gpt2-large': 1024,
'gpt2-xl': 1024,
'distilgpt2': 1024,
}
......@@ -69,7 +72,7 @@ def bytes_to_unicode():
"""
Returns a list of utf-8 bytes and a mapping to unicode strings.
We specifically avoid mapping to whitespace/control characters the bpe code barfs on.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
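For reference, the byte-to-unicode table this docstring describes can be sketched as follows (a minimal reconstruction following the GPT-2 reference implementation, not code taken from this diff): printable bytes keep their own code point, and the remaining bytes are shifted into an unused unicode range so none of them collide with whitespace or control characters.

    def bytes_to_unicode_sketch():
        # bytes that already map to printable characters are kept as-is
        bs = list(range(ord("!"), ord("~") + 1)) + \
             list(range(ord("\xa1"), ord("\xac") + 1)) + \
             list(range(ord("\xae"), ord("\xff") + 1))
        cs = bs[:]
        n = 0
        for b in range(2 ** 8):
            if b not in bs:
                # remaining bytes (whitespace/control) are remapped above 255
                bs.append(b)
                cs.append(2 ** 8 + n)
                n += 1
        return dict(zip(bs, (chr(c) for c in cs)))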
......@@ -104,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
- Requires a space to start the input string => the encoding and tokenize methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
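An illustrative usage snippet for the ``add_prefix_space`` behaviour documented above (a sketch, assuming the package is importable as transformers at this commit and the 'gpt2' vocabulary can be downloaded; the round-trip results are taken from the docstring rather than verified here):

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # without the flag, a leading space is not preserved through an encode/decode round trip
    tokenizer.decode(tokenizer.encode(" Hello"))                        # -> "Hello" (per the docstring above)
    # with the flag, a space is prepended before tokenization
    tokenizer.decode(tokenizer.encode("Hello", add_prefix_space=True))  # -> " Hello"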
......@@ -119,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
with open(merges_file, encoding='utf-8') as merges_handle:
bpe_merges = merges_handle.read().split('\n')[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
......@@ -181,7 +186,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
""" Tokenize a string.
Args:
- add_prefix_space (boolean, default False):
Begin the sentence with at least one space toto get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
"""
if add_prefix_space:
text = ' ' + text
......@@ -231,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
writer.write(' '.join(bpe_tokens) + u'\n')
index += 1
return vocab_file, merge_file
\ No newline at end of file
return vocab_file, merge_file
......@@ -101,9 +101,11 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
self.nlp = BasicTokenizer(do_lower_case=True)
self.fix_text = None
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......
......@@ -47,6 +47,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
},
'merges_file':
{
......@@ -54,6 +56,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
},
}
......@@ -62,6 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'roberta-large': 512,
'roberta-large-mnli': 512,
'distilroberta-base': 512,
'roberta-base-openai-detector': 512,
'roberta-large-openai-detector': 512,
}
......@@ -114,7 +120,7 @@ class RobertaTokenizer(GPT2Tokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
......
......@@ -21,6 +21,8 @@ import os
import json
import six
import copy
import itertools
import re
from io import open
from .file_utils import cached_path, is_tf_available, is_torch_available
......@@ -76,6 +78,8 @@ class PreTrainedTokenizer(object):
"pad_token", "cls_token", "mask_token",
"additional_special_tokens"]
padding_side = "right"
@property
def bos_token(self):
""" Beginning of sentence token (string). Log an error if used while not having been set. """
......@@ -189,6 +193,11 @@ class PreTrainedTokenizer(object):
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
return self.convert_tokens_to_ids(self.pad_token)
@property
def pad_token_type_id(self):
""" Id of the padding token type in the vocabulary."""
return self._pad_token_type_id
@property
def cls_token_id(self):
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
......@@ -212,10 +221,14 @@ class PreTrainedTokenizer(object):
self._pad_token = None
self._cls_token = None
self._mask_token = None
self._pad_token_type_id = 0
self._additional_special_tokens = []
self.max_len = max_len if max_len is not None else int(1e12)
# Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
self.padding_side = kwargs.pop('padding_side', self.padding_side)
# Added tokens
self.added_tokens_encoder = {}
self.added_tokens_decoder = {}
......@@ -251,6 +264,9 @@ class PreTrainedTokenizer(object):
force_download: (`optional`) boolean, default False:
Force to (re-)download the vocabulary files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
......@@ -286,6 +302,7 @@ class PreTrainedTokenizer(object):
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
resume_download = kwargs.pop('resume_download', False)
proxies = kwargs.pop('proxies', None)
s3_models = list(cls.max_model_input_sizes.keys())
......@@ -342,7 +359,7 @@ class PreTrainedTokenizer(object):
"We assumed '{}' was a path or url to a directory containing vocabulary files "
"named {} but couldn't find such vocabulary files at this path or url.".format(
pretrained_model_name_or_path, ', '.join(s3_models),
pretrained_model_name_or_path,
pretrained_model_name_or_path,
list(cls.vocab_files_names.values())))
# Get files from url, cache, or disk depending on the case
......@@ -352,7 +369,7 @@ class PreTrainedTokenizer(object):
if file_path is None:
resolved_vocab_files[file_id] = None
else:
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download)
except EnvironmentError:
if pretrained_model_name_or_path in s3_models:
msg = "Couldn't reach server at '{}' to download vocabulary files."
......@@ -377,7 +394,8 @@ class PreTrainedTokenizer(object):
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
if tokenizer_config_file is not None:
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
init_kwargs = json.load(tokenizer_config_handle)
saved_init_inputs = init_kwargs.pop('init_inputs', ())
if not init_inputs:
init_inputs = saved_init_inputs
......@@ -402,7 +420,8 @@ class PreTrainedTokenizer(object):
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if special_tokens_map_file is not None:
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key not in init_kwargs:
init_kwargs[key] = value
......@@ -416,7 +435,8 @@ class PreTrainedTokenizer(object):
# Add supplementary tokens.
if added_tokens_file is not None:
added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
tokenizer.added_tokens_encoder.update(added_tok_encoder)
tokenizer.added_tokens_decoder.update(added_tok_decoder)
......@@ -512,6 +532,8 @@ class PreTrainedTokenizer(object):
to_add_tokens = []
for token in new_tokens:
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
token = token.lower()
if token != self.unk_token and \
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
token not in to_add_tokens:
......@@ -604,7 +626,24 @@ class PreTrainedTokenizer(object):
vocabularies (BPE/SentencePieces/WordPieces).
Take care of added tokens.
text: The sequence to be encoded.
return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
**kwargs: passed to the child `self.tokenize()` method
"""
def lowercase_text(t):
# convert non-special tokens to lowercase
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
r'(.+?)'
return re.sub(
pattern,
lambda m: m.groups()[0] or m.groups()[1].lower(),
t)
if self.init_kwargs.get('do_lower_case', False):
text = lowercase_text(text)
def split_on_token(tok, text):
result = []
split_text = text.split(tok)
......@@ -641,9 +680,9 @@ class PreTrainedTokenizer(object):
tokenized_text += [sub_text]
text_list = tokenized_text
return sum((self._tokenize(token, **kwargs) if token not \
return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
in self.added_tokens_encoder and token not in self.all_special_tokens \
else [token] for token in tokenized_text), [])
else [token] for token in tokenized_text)))
added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
tokenized_text = split_on_tokens(added_tokens, text)
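The switch from ``sum(..., [])`` to ``itertools.chain.from_iterable`` above flattens the per-token sublists without quadratic list copying; a tiny standalone comparison (illustrative only, not part of this diff):

    import itertools

    nested = [['Hel', 'lo'], ['world'], ['!']]
    sum(nested, [])                              # ['Hel', 'lo', 'world', '!'], but copies the list at every step
    list(itertools.chain.from_iterable(nested))  # same result, built in a single linear pass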
......@@ -671,10 +710,6 @@ class PreTrainedTokenizer(object):
ids = []
for token in tokens:
ids.append(self._convert_token_to_id_with_added_voc(token))
if len(ids) > self.max_len:
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(ids), self.max_len))
return ids
def _convert_token_to_id_with_added_voc(self, token):
......@@ -695,6 +730,7 @@ class PreTrainedTokenizer(object):
max_length=None,
stride=0,
truncation_strategy='longest_first',
pad_to_max_length=False,
return_tensors=None,
**kwargs):
"""
......@@ -721,6 +757,12 @@ class PreTrainedTokenizer(object):
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
**kwargs: passed to the `self.tokenize()` method
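A hypothetical call illustrating the new ``pad_to_max_length`` argument (the model name and lengths are examples chosen for illustration, not taken from this diff):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ids = tokenizer.encode("Hello world", max_length=8, pad_to_max_length=True)
    # BERT pads on the right, so ids has length 8 and ends with pad_token_id;
    # a left-padding tokenizer such as XLNet would prepend the padding instead.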
......@@ -731,6 +773,7 @@ class PreTrainedTokenizer(object):
add_special_tokens=add_special_tokens,
stride=stride,
truncation_strategy=truncation_strategy,
pad_to_max_length=pad_to_max_length,
return_tensors=return_tensors,
**kwargs)
......@@ -743,7 +786,12 @@ class PreTrainedTokenizer(object):
max_length=None,
stride=0,
truncation_strategy='longest_first',
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
**kwargs):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
......@@ -768,9 +816,40 @@ class PreTrainedTokenizer(object):
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default True).
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
**kwargs: passed to the `self.tokenize()` method
Return:
A Dictionary of shape::
{
input_ids: list[int],
token_type_ids: list[int] if return_token_type_ids is True (default)
attention_mask: list[int] if return_attention_mask is True (default)
overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` and return_special_tokens_mask is True
}
With the fields:
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``attention_mask``: list of indices specifying which tokens should be attended to by the model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added
tokens and 0 specifying sequence tokens.
"""
def get_input_ids(text):
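A schematic usage of ``encode_plus`` with the new return switches (illustrative only; assumes a pretrained BERT tokenizer as in the earlier snippet):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    enc = tokenizer.encode_plus("Hello world", "How are you?",
                                max_length=16,
                                pad_to_max_length=True,
                                return_token_type_ids=True,
                                return_attention_mask=True,
                                return_special_tokens_mask=True)
    sorted(enc.keys())
    # ['attention_mask', 'input_ids', 'special_tokens_mask', 'token_type_ids']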
......@@ -789,13 +868,24 @@ class PreTrainedTokenizer(object):
return self.prepare_for_model(first_ids,
pair_ids=second_ids,
max_length=max_length,
pad_to_max_length=pad_to_max_length,
add_special_tokens=add_special_tokens,
stride=stride,
truncation_strategy=truncation_strategy,
return_tensors=return_tensors)
return_tensors=return_tensors,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask)
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
truncation_strategy='longest_first', return_tensors=None):
truncation_strategy='longest_first',
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False):
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
It adds special tokens, truncates
......@@ -818,23 +908,36 @@ class PreTrainedTokenizer(object):
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
The tokenizer padding sides are handled by the following strings:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
Defaults to False: no padding.
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
or PyTorch torch.Tensor instead of a list of python integers.
return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
return_attention_mask: (optional) Set to False to avoid returning attention mask (default True).
return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
Return:
A Dictionary of shape::
{
input_ids: list[int],
overflowing_tokens: list[int] if a ``max_length`` is specified, else None
special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
token_type_ids: list[int] if return_token_type_ids is True (default)
overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` and return_special_tokens_mask is True
}
With the fields:
``input_ids``: list of tokens to be fed to a model
``input_ids``: list of token ids to be fed to a model
``token_type_ids``: list of token type ids to be fed to a model
``overflowing_tokens``: list of overflowing tokens if a max length is specified.
``num_truncated_tokens``: number of overflowing tokens if a ``max_length`` is specified
``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added
tokens and 0 specifying sequence tokens.
"""
......@@ -843,23 +946,31 @@ class PreTrainedTokenizer(object):
len_pair_ids = len(pair_ids) if pair else 0
encoded_inputs = {}
# Handle max sequence length
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
if max_length and total_len > max_length:
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
num_tokens_to_remove=total_len-max_length,
truncation_strategy=truncation_strategy,
stride=stride)
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_length
if return_overflowing_tokens:
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_length
# Handle special_tokens
if add_special_tokens:
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
else:
sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
# Prepare inputs as tensors if asked
if return_tensors == 'tf' and is_tf_available():
sequence = tf.constant([sequence])
token_type_ids = tf.constant([token_type_ids])
......@@ -870,13 +981,57 @@ class PreTrainedTokenizer(object):
logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
encoded_inputs["input_ids"] = sequence
encoded_inputs["token_type_ids"] = token_type_ids
if return_token_type_ids:
encoded_inputs["token_type_ids"] = token_type_ids
if max_length and len(encoded_inputs["input_ids"]) > max_length:
encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
if return_token_type_ids:
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(ids), self.max_len))
needs_to_be_padded = pad_to_max_length and (
max_length and len(encoded_inputs["input_ids"]) < max_length
or
max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000
)
if pad_to_max_length and max_length is None and self.max_len > 10000:
logger.warning("Sequence can't be padded as the maximum ")
if needs_to_be_padded:
difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
if self.padding_side == 'right':
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
if return_token_type_ids:
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
elif self.padding_side == 'left':
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
if return_token_type_ids:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
elif return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
return encoded_inputs
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
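The left/right padding logic above can be summarised in a small self-contained sketch (plain Python with hypothetical helper names, shown here only to clarify the two strategies):

    def pad_sketch(input_ids, pad_id, target_len, side='right'):
        # returns (padded ids, attention mask) following the strategy strings used above
        diff = target_len - len(input_ids)
        if side == 'right':
            return input_ids + [pad_id] * diff, [1] * len(input_ids) + [0] * diff
        if side == 'left':
            return [pad_id] * diff + input_ids, [0] * diff + [1] * len(input_ids)
        raise ValueError("Invalid padding strategy: " + str(side))

    pad_sketch([5, 6, 7], 0, 5, side='right')  # ([5, 6, 7, 0, 0], [1, 1, 1, 0, 0])
    pad_sketch([5, 6, 7], 0, 5, side='left')   # ([0, 0, 5, 6, 7], [0, 0, 1, 1, 1])

As the XLNet hunks further down in this diff show, XLNetTokenizer overrides ``padding_side`` to "left" and sets ``_pad_token_type_id`` to 3.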
......@@ -920,7 +1075,6 @@ class PreTrainedTokenizer(object):
return (ids, pair_ids, overflowing_tokens)
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
logger.warning("This tokenizer does not make use of special tokens.")
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
......@@ -933,7 +1087,6 @@ class PreTrainedTokenizer(object):
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
"""
logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
if token_ids_1 is None:
return token_ids_0
return token_ids_0 + token_ids_1
......@@ -951,7 +1104,7 @@ class PreTrainedTokenizer(object):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
......@@ -1055,7 +1208,7 @@ class PreTrainedTokenizer(object):
class attributes (cls_token, unk_token...).
"""
all_toks = self.all_special_tokens
all_ids = list(self._convert_token_to_id(t) for t in all_toks)
all_ids = self.convert_tokens_to_ids(all_toks)
return all_ids
@staticmethod
......
......@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
"""Tokenization classes for XLM."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
......@@ -524,7 +524,7 @@ class XLMTokenizer(PreTrainedTokenizer):
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
......@@ -564,9 +564,11 @@ class XLMTokenizer(PreTrainedTokenizer):
self.ja_word_tokenizer = None
self.zh_word_tokenizer = None
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......@@ -758,9 +760,9 @@ class XLMTokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
A XLM sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
pair of sequences: <s> A </s> B </s>
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -781,7 +783,7 @@ class XLMTokenizer(PreTrainedTokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......
......@@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
padding_side = "left"
def __init__(self, vocab_file,
do_lower_case=False, remove_space=True, keep_accents=False,
......@@ -74,6 +75,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
self._pad_token_type_id = 3
try:
import sentencepiece as spm
......@@ -141,7 +143,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(
piece[:-1].replace(SPIECE_UNDERLINE, ''))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
......@@ -185,9 +187,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
An XLNet sequence has the following format:
single sequence: X <sep> <cls>
pair of sequences: A <sep> B <sep> <cls>
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -208,7 +210,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -224,10 +226,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
An XLNet sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
| first sequence | second sequence | CLS segment ID
if token_ids_1 is None, only returns the first portion of the mask (0's).
"""
sep = [self.sep_token_id]
......
''' Script for downloading all GLUE data.
Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
"MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
"QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
data_file = "%s.zip" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
zip_ref.extractall(data_dir)
os.remove(data_file)
print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
os.mkdir(mrpc_dir)
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
else:
print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
dev_ids = []
with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
for row in ids_fh:
dev_ids.append(row.strip().split('\t'))
with open(mrpc_train_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
header = data_fh.readline()
train_fh.write(header)
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
else:
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
with open(mrpc_test_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
print("\tCompleted!")
def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
print("\tCompleted!")
return
def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
else:
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
tasks.append(task_name)
return tasks
def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
type=str, default='')
args = parser.parse_args(arguments)
if not os.path.isdir(args.data_dir):
os.mkdir(args.data_dir)
tasks = get_tasks(args.tasks)
for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_diagnostic(args.data_dir)
else:
download_and_extract(task, args.data_dir)
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
---
- step:
name: Execute python examples/run_glue.py
image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
command:
- python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data
- pip install -e .
- pip install -r examples/requirements.txt
- python examples/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters}
parameters:
- name: model_type
pass-as: --model_type={v}
type: string
default: bert
- name: model_name_or_path
pass-as: --model_name_or_path={v}
type: string
default: bert-base-uncased
- name: task_name
pass-as: --task_name={v}
type: string
default: MRPC
- name: max_seq_length
pass-as: --max_seq_length={v}
description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.
type: integer
default: 128
- name: per_gpu_train_batch_size
pass-as: --per_gpu_train_batch_size={v}
description: Batch size per GPU/CPU for training.
type: integer
default: 8
- name: per_gpu_eval_batch_size
pass-as: --per_gpu_eval_batch_size={v}
description: Batch size per GPU/CPU for evaluation.
type: integer
default: 8
- name: gradient_accumulation_steps
pass-as: --gradient_accumulation_steps={v}
description: Number of updates steps to accumulate before performing a backward/update pass.
type: integer
default: 1
- name: learning_rate
pass-as: --learning_rate={v}
description: The initial learning rate for Adam.
type: float
default: 0.00005
- name: adam_epsilon
pass-as: --adam_epsilon={v}
description: Epsilon for Adam optimizer.
type: float
default: 0.00000001
- name: max_grad_norm
pass-as: --max_grad_norm={v}
description: Max gradient norm.
type: float
default: 1.0
- name: num_train_epochs
pass-as: --num_train_epochs={v}
description: Total number of training epochs to perform.
type: integer
default: 3
- name: max_steps
pass-as: --max_steps={v}
description: If > 0, set total number of training steps to perform. Override num_train_epochs.
type: integer
default: -1
- name: warmup_steps
pass-as: --warmup_steps={v}
description: Linear warmup over warmup_steps.
type: integer
default: -1
- name: logging_steps
pass-as: --logging_steps={v}
description: Log every X updates steps.
type: integer
default: 25
- name: save_steps
pass-as: --save_steps={v}
description: Save checkpoint every X updates steps.
type: integer
default: -1
- name: output_dir
pass-as: --output_dir={v}
type: string
default: /valohai/outputs
- name: evaluate_during_training
description: Run evaluation during training at each logging step.
type: flag
default: true
- name: do_lower_case
description: Set this flag if you are using an uncased model.
type: flag