Commit 1c12ee0e authored by thomwolf's avatar thomwolf
Browse files

Fix XLM-RoBERTa tokenizer max_length handling and register XLM-RoBERTa in the AutoModel classes

parent 65c75fc5
......@@ -20,7 +20,7 @@ import logging
from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
TransfoXLConfig, XLMConfig, XLNetConfig)
TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
......@@ -41,7 +41,8 @@ from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertF
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_utils import PreTrainedModel, SequenceSummary
......@@ -146,6 +147,8 @@ class AutoModel(object):
return AlbertModel(config)
elif isinstance(config, CamembertConfig):
return CamembertModel(config)
elif isinstance(config, XLMRobertaConfig):
return XLMRobertaModel(config)
raise ValueError("Unrecognized configuration class {}".format(config))
@classmethod
......@@ -333,6 +336,8 @@ class AutoModelWithLMHead(object):
return XLMWithLMHeadModel(config)
elif isinstance(config, CTRLConfig):
return CTRLLMHeadModel(config)
elif isinstance(config, XLMRobertaConfig):
return XLMRobertaForMaskedLM(config)
raise ValueError("Unrecognized configuration class {}".format(config))
@classmethod
......@@ -509,6 +514,8 @@ class AutoModelForSequenceClassification(object):
return XLNetForSequenceClassification(config)
elif isinstance(config, XLMConfig):
return XLMForSequenceClassification(config)
elif isinstance(config, XLMRobertaConfig):
return XLMRobertaForSequenceClassification(config)
raise ValueError("Unrecognized configuration class {}".format(config))
@classmethod
......@@ -787,6 +794,8 @@ class AutoModelForTokenClassification:
return XLNetForTokenClassification(config)
elif isinstance(config, RobertaConfig):
return RobertaForTokenClassification(config)
elif isinstance(config, XLMRobertaConfig):
return XLMRobertaForTokenClassification(config)
raise ValueError("Unrecognized configuration class {}".format(config))
@classmethod
......@@ -865,6 +874,8 @@ class AutoModelForTokenClassification:
return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
......@@ -873,4 +884,4 @@ class AutoModelForTokenClassification:
return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'xlnet', 'camembert', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
"'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
......@@ -415,7 +415,7 @@ class PreTrainedModel(nn.Module):
state_dict = torch.load(resolved_archive_file, map_location='cpu')
except:
raise OSError("Unable to load weights from pytorch checkpoint file. "
"If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
"If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
missing_keys = []
unexpected_keys = []
......
......@@ -49,7 +49,7 @@ logger = logging.getLogger(__name__)
def get_framework(model=None):
""" Select framework (TensorFlow/PyTorch) to use.
If both frameworks are installed and no specific model is provided, defaults to using TensorFlow.
If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
"""
if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
# Both framework are available but the use supplied a model class instance.
......@@ -60,7 +60,8 @@ def get_framework(model=None):
"To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
"To install PyTorch, read the instructions at https://pytorch.org/.")
else:
framework = 'tf' if is_tf_available() else 'pt'
# framework = 'tf' if is_tf_available() else 'pt'
framework = 'pt' if is_torch_available() else 'tf'
return framework
class ArgumentHandler(ABC):
......
......@@ -434,7 +434,11 @@ class PreTrainedTokenizer(object):
init_kwargs[key] = value
# Instantiate tokenizer.
tokenizer = cls(*init_inputs, **init_kwargs)
try:
tokenizer = cls(*init_inputs, **init_kwargs)
except OSError:
OSError("Unable to load vocabulary from file. "
"Please check that the provided vocabulary is accessible and not corrupted.")
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
......
......@@ -40,8 +40,12 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'xlm-roberta-base': None,
'xlm-roberta-large': None,
'xlm-roberta-base': 512,
'xlm-roberta-large': 512,
'xlm-roberta-large-finetuned-conll02-dutch': 512,
'xlm-roberta-large-finetuned-conll02-spanish': 512,
'xlm-roberta-large-finetuned-conll03-english': 512,
'xlm-roberta-large-finetuned-conll03-german': 512,
}
class XLMRobertaTokenizer(PreTrainedTokenizer):
......@@ -58,10 +62,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
**kwargs):
super(XLMRobertaTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token,
**kwargs)
super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token,
**kwargs)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
self.sp_model = spm.SentencePieceProcessor()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment