Unverified commit 8d5a47c7 authored by Thomas Wolf, committed by GitHub

Merge pull request #2243 from huggingface/fix-xlm-roberta

fixing xlm-roberta tokenizer max_length and automodels
parents 65c75fc5 79e4a6a2
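
Note: the user-visible effect of this merge is that the Auto* factories dispatch on XLMRobertaConfig and the XLM-RoBERTa tokenizer gets a real 512-token limit. A minimal sketch of what now works (assuming a transformers build that includes this merge and that the pretrained weights can be downloaded):

    from transformers import AutoConfig, AutoModel, AutoModelWithLMHead

    config = AutoConfig.from_pretrained('xlm-roberta-base')
    model = AutoModel.from_pretrained('xlm-roberta-base')               # -> XLMRobertaModel
    lm_model = AutoModelWithLMHead.from_pretrained('xlm-roberta-base')  # -> XLMRobertaForMaskedLM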
@@ -9,6 +9,9 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

 def try_infer_format_from_ext(path: str):
+    if not path:
+        return 'pipe'
+
     for ext in PipelineDataFormat.SUPPORTED_FORMATS:
         if path.endswith(ext):
             return ext
@@ -20,9 +23,16 @@ def try_infer_format_from_ext(path: str):

 def run_command_factory(args):
-    nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device)
+    nlp = pipeline(task=args.task,
+                   model=args.model if args.model else None,
+                   config=args.config,
+                   tokenizer=args.tokenizer,
+                   device=args.device)
     format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
-    reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column)
+    reader = PipelineDataFormat.from_str(format=format,
+                                         output_path=args.output,
+                                         input_path=args.input,
+                                         column=args.column if args.column else nlp.default_input_names)
     return RunCommand(nlp, reader)
@@ -35,24 +45,26 @@ class RunCommand(BaseTransformersCLICommand):
     @staticmethod
     def register_subcommand(parser: ArgumentParser):
         run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
-        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
         run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
-        run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.')
-        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
-        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
+        run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.')
         run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.')
         run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
         run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)')
         run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
+        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
+        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
+        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
         run_parser.set_defaults(func=run_command_factory)

     def run(self):
-        nlp, output = self._nlp, []
+        nlp, outputs = self._nlp, []
         for entry in self._reader:
-            if self._reader.is_multi_columns:
-                output += nlp(**entry)
+            output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
+            if isinstance(output, dict):
+                outputs.append(output)
             else:
-                output += nlp(entry)
+                outputs += output

         # Saving data
         if self._nlp.binary_output:
......
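
Note: with --model now optional in both the run and serve commands, the factories pass model=None and pipeline() falls back to the task's default model; likewise a missing --column falls back to the pipeline's default_input_names (see the pipelines.py changes below). A hedged sketch of the equivalent Python call, with an illustrative task and input:

    from transformers import pipeline

    nlp = pipeline(task='sentiment-analysis', model=None)  # None lets the task pick its default model
    print(nlp('Transformers pipelines are easy to use.'))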
@@ -24,7 +24,11 @@ def serve_command_factory(args: Namespace):
     Factory function used to instantiate serving server from provided command line arguments.
     :return: ServeCommand
     """
-    nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device)
+    nlp = pipeline(task=args.task,
+                   model=args.model if args.model else None,
+                   config=args.config,
+                   tokenizer=args.tokenizer,
+                   device=args.device)
     return ServeCommand(nlp, args.host, args.port)
@@ -68,12 +72,12 @@ class ServeCommand(BaseTransformersCLICommand):
         """
         serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.')
         serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on')
-        serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
         serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.')
         serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.')
-        serve_parser.add_argument('--model', type=str, required=True, help='Model\'s name or path to stored model.')
+        serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.')
         serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.')
         serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.')
+        serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
         serve_parser.set_defaults(func=serve_command_factory)

     def __init__(self, pipeline: Pipeline, host: str, port: int):
......
@@ -20,7 +20,7 @@ import logging

 from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
                                  DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
-                                 TransfoXLConfig, XLMConfig, XLNetConfig)
+                                 TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)

 from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
     BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -41,7 +41,8 @@ from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertF
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
     AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
+    XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_utils import PreTrainedModel, SequenceSummary
@@ -146,6 +147,8 @@ class AutoModel(object):
             return AlbertModel(config)
         elif isinstance(config, CamembertConfig):
             return CamembertModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaModel(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod
@@ -333,6 +336,8 @@ class AutoModelWithLMHead(object):
             return XLMWithLMHeadModel(config)
         elif isinstance(config, CTRLConfig):
             return CTRLLMHeadModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForMaskedLM(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod
@@ -509,6 +514,8 @@ class AutoModelForSequenceClassification(object):
             return XLNetForSequenceClassification(config)
         elif isinstance(config, XLMConfig):
             return XLMForSequenceClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForSequenceClassification(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod
@@ -787,6 +794,8 @@ class AutoModelForTokenClassification:
             return XLNetForTokenClassification(config)
         elif isinstance(config, RobertaConfig):
             return RobertaForTokenClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForTokenClassification(config)
         raise ValueError("Unrecognized configuration class {}".format(config))

     @classmethod
@@ -865,6 +874,8 @@ class AutoModelForTokenClassification:
             return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -873,4 +884,4 @@ class AutoModelForTokenClassification:
             return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'camembert', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
+                         "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
@@ -415,7 +415,7 @@ class PreTrainedModel(nn.Module):
                 state_dict = torch.load(resolved_archive_file, map_location='cpu')
             except:
                 raise OSError("Unable to load weights from pytorch checkpoint file. "
-                    "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
+                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")

         missing_keys = []
         unexpected_keys = []
......
@@ -14,12 +14,14 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals

+import sys
 import csv
 import json
 import os
 import pickle
 import logging
+import six

 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from itertools import groupby
@@ -49,7 +51,7 @@ logger = logging.getLogger(__name__)

 def get_framework(model=None):
     """ Select framework (TensorFlow/PyTorch) to use.
-        If both frameworks are installed and no specific model is provided, defaults to using TensorFlow.
+        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
     """
     if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
         # Both frameworks are available but the user supplied a model class instance.
@@ -60,7 +62,8 @@ def get_framework(model=None):
                           "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
                           "To install PyTorch, read the instructions at https://pytorch.org/.")
     else:
-        framework = 'tf' if is_tf_available() else 'pt'
+        # framework = 'tf' if is_tf_available() else 'pt'
+        framework = 'pt' if is_torch_available() else 'tf'
     return framework


 class ArgumentHandler(ABC):
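
Note: the docstring fix and the fallback flip keep get_framework() consistent; when both backends are installed and no model instance is given, PyTorch now wins. A minimal check, assuming get_framework stays importable from transformers.pipelines:

    from transformers.pipelines import get_framework

    print(get_framework())  # 'pt' when torch is importable, otherwise 'tf'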
@@ -97,28 +100,29 @@ class PipelineDataFormat:
     Supported data formats currently includes:
      - JSON
      - CSV
      - stdin/stdout (pipe)

     PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns
     to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
     """
     SUPPORTED_FORMATS = ['json', 'csv', 'pipe']

-    def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]):
-        self.output = output
-        self.path = input
-        self.column = column.split(',') if column else ['']
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        self.output_path = output_path
+        self.input_path = input_path
+        self.column = column.split(',') if column is not None else ['']
         self.is_multi_columns = len(self.column) > 1

         if self.is_multi_columns:
             self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column]

-        if output is not None:
-            if exists(abspath(self.output)):
-                raise OSError('{} already exists on disk'.format(self.output))
+        if output_path is not None:
+            if exists(abspath(self.output_path)):
+                raise OSError('{} already exists on disk'.format(self.output_path))

-        if input is not None:
-            if not exists(abspath(self.path)):
-                raise OSError('{} doesnt exist on disk'.format(self.path))
+        if input_path is not None:
+            if not exists(abspath(self.input_path)):
+                raise OSError('{} doesnt exist on disk'.format(self.input_path))

     @abstractmethod
     def __iter__(self):
@@ -139,7 +143,7 @@ class PipelineDataFormat:
         :param data: data to store
         :return: (str) Path where the data has been saved
         """
-        path, _ = os.path.splitext(self.output)
+        path, _ = os.path.splitext(self.output_path)
         binary_path = os.path.extsep.join((path, 'pickle'))

         with open(binary_path, 'wb+') as f_output:
@@ -148,23 +152,23 @@ class PipelineDataFormat:
         return binary_path

     @staticmethod
-    def from_str(name: str, output: Optional[str], path: Optional[str], column: Optional[str]):
-        if name == 'json':
-            return JsonPipelineDataFormat(output, path, column)
-        elif name == 'csv':
-            return CsvPipelineDataFormat(output, path, column)
-        elif name == 'pipe':
-            return PipedPipelineDataFormat(output, path, column)
+    def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        if format == 'json':
+            return JsonPipelineDataFormat(output_path, input_path, column)
+        elif format == 'csv':
+            return CsvPipelineDataFormat(output_path, input_path, column)
+        elif format == 'pipe':
+            return PipedPipelineDataFormat(output_path, input_path, column)
         else:
-            raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(name))
+            raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format))


 class CsvPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]):
-        super().__init__(output, input, column)
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        super().__init__(output_path, input_path, column)

     def __iter__(self):
-        with open(self.path, 'r') as f:
+        with open(self.input_path, 'r') as f:
             reader = csv.DictReader(f)
             for row in reader:
                 if self.is_multi_columns:
@@ -173,7 +177,7 @@ class CsvPipelineDataFormat(PipelineDataFormat):
                 yield row[self.column[0]]

     def save(self, data: List[dict]):
-        with open(self.output, 'w') as f:
+        with open(self.output_path, 'w') as f:
             if len(data) > 0:
                 writer = csv.DictWriter(f, list(data[0].keys()))
                 writer.writeheader()
@@ -181,10 +185,10 @@ class CsvPipelineDataFormat(PipelineDataFormat):

 class JsonPipelineDataFormat(PipelineDataFormat):
-    def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]):
-        super().__init__(output, input, column)
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]):
+        super().__init__(output_path, input_path, column)

-        with open(input, 'r') as f:
+        with open(input_path, 'r') as f:
             self._entries = json.load(f)

     def __iter__(self):
@@ -195,7 +199,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
             yield entry[self.column[0]]

     def save(self, data: dict):
-        with open(self.output, 'w') as f:
+        with open(self.output_path, 'w') as f:
             json.dump(data, f)
@@ -207,9 +211,7 @@ class PipedPipelineDataFormat(PipelineDataFormat):
     If columns are provided, then the output will be a dictionary with {column_x: value_x}
     """
     def __iter__(self):
-        import sys
-
         for line in sys.stdin:
             # Split for multi-columns
             if '\t' in line:
@@ -228,7 +230,7 @@ class PipedPipelineDataFormat(PipelineDataFormat):
             print(data)

     def save_binary(self, data: Union[dict, List[dict]]) -> str:
-        if self.output is None:
+        if self.output_path is None:
             raise KeyError(
                 'When using piped input on pipeline outputting large object requires an output file path. '
                 'Please provide such output path through --output argument.'
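
Note: with the renamed keyword arguments, a reader is now built as below; the file names and the question=q,context=c column mapping are hypothetical:

    from transformers.pipelines import PipelineDataFormat

    reader = PipelineDataFormat.from_str(format='csv',
                                         output_path='answers.csv',   # must not already exist
                                         input_path='questions.csv',  # must exist on disk
                                         column='question=q,context=c')
    for entry in reader:
        pass  # each entry is a dict for multi-column input, a single value otherwise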
@@ -293,6 +295,9 @@ class Pipeline(_ScikitCompat):
         nlp = NerPipeline(model='...', config='...', tokenizer='...')
         nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
     """
+
+    default_input_names = None
+
     def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
                  modelcard: ModelCard = None, framework: Optional[str] = None,
                  args_parser: ArgumentHandler = None, device: int = -1,
@@ -581,6 +586,8 @@ class QuestionAnsweringPipeline(Pipeline):
     Question Answering pipeline using ModelForQuestionAnswering head.
     """

+    default_input_names = 'question,context'
+
     def __init__(self, model,
                  tokenizer: Optional[PreTrainedTokenizer],
                  modelcard: Optional[ModelCard],
@@ -683,7 +690,6 @@ class QuestionAnsweringPipeline(Pipeline):
                 }
                 for s, e, score in zip(starts, ends, scores)
             ]
-
             if len(answers) == 1:
                 return answers[0]
             return answers
......
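
Note: default_input_names is the hook run_command_factory falls back to when --column is omitted; the QA pipeline advertises its two inputs. A minimal check, assuming QuestionAnsweringPipeline is exported at the top level:

    from transformers import QuestionAnsweringPipeline

    print(QuestionAnsweringPipeline.default_input_names)  # 'question,context'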
@@ -434,7 +434,11 @@ class PreTrainedTokenizer(object):
             init_kwargs[key] = value

         # Instantiate tokenizer.
-        tokenizer = cls(*init_inputs, **init_kwargs)
+        try:
+            tokenizer = cls(*init_inputs, **init_kwargs)
+        except OSError:
+            raise OSError("Unable to load vocabulary from file. "
+                          "Please check that the provided vocabulary is accessible and not corrupted.")

         # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
         tokenizer.init_inputs = init_inputs
......
@@ -40,8 +40,12 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'xlm-roberta-base': None,
-    'xlm-roberta-large': None,
+    'xlm-roberta-base': 512,
+    'xlm-roberta-large': 512,
+    'xlm-roberta-large-finetuned-conll02-dutch': 512,
+    'xlm-roberta-large-finetuned-conll02-spanish': 512,
+    'xlm-roberta-large-finetuned-conll03-english': 512,
+    'xlm-roberta-large-finetuned-conll03-german': 512,
 }


 class XLMRobertaTokenizer(PreTrainedTokenizer):
@@ -58,10 +62,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
                  **kwargs):
-        super(XLMRobertaTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
-                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
-                                                  mask_token=mask_token,
-                                                  **kwargs)
+        super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
+                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
+                                                  mask_token=mask_token,
+                                                  **kwargs)
         self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
         self.sp_model = spm.SentencePieceProcessor()
......
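
Note: with the hard-coded max_len=512 kwarg removed, the limit now flows from PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, and the per-input budgets follow from the special-token counts above. A quick check (requires downloading the sentencepiece vocabulary):

    from transformers import XLMRobertaTokenizer

    tok = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    print(tok.max_len)                  # 512
    print(tok.max_len_single_sentence)  # 510 (512 - 2 special tokens)
    print(tok.max_len_sentences_pair)   # 508 (512 - 4 special tokens)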