add model cards cc @mfuntowicz

56e98ba8 · thomwolf · 8669598a · 56e98ba8 · 56e98ba8 · 56e98ba8
Commit 56e98ba8 authored Dec 16, 2019 by thomwolf
4 changed files
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -33,6 +33,9 @@ from .data import (is_sklearn_available,
 if is_sklearn_available():
    from .data import glue_compute_metrics, xnli_compute_metrics

+# Model Cards
+from .model_card import ModelCard
+
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer

--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -72,7 +72,7 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
-
+MODEL_CARD_NAME = "model_card.json"

 DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
 DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]

--- a/transformers/model_card.py
+++ b/transformers/model_card.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Model card class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+import re
+from io import open
+
+from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url
+
+
+logger = logging.getLogger(__name__)
+
+
+# Single lookup table merging the pretrained archive maps of every supported
+# architecture. The maps are merged in the order listed below, so if two maps
+# shared a key the entry from the later map would be the one kept.
+ALL_MODELS_MAP = {
+    shortcut_name: archive_url
+    for archive_map in (
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    )
+    for shortcut_name, archive_url in archive_map.items()
+}
+
+
+class ModelCard(object):
+    r""" Model Card class.
+        Store model card as well as methods for loading/downloading/saving model cards.
+
+        Please read the following paper for details and explanation on the sections:
+            "Model Cards for Model Reporting"
+                by Margaret Mitchell, Simone Wu,
+                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
+                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
+            Link: https://arxiv.org/abs/1810.03993
+
+        Note:
+            A model card can be loaded and saved to disk.
+
+        Parameters:
+            model_details, intended_use, factors, metrics, evaluation_data, training_data,
+            quantitative_analyses, ethical_considerations, caveats_and_recommendations: (`optional`)
+                The recommended model card sections. Each one is stored as an attribute of the
+                same name and defaults to an empty dict when it is not provided.
+
+            kwargs: (`optional`) any other key/value pairs are stored as additional attributes of the model card.
+    """
+    def __init__(self, **kwargs):
+        # Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+        self.model_details = kwargs.pop('model_details', {})
+        self.intended_use = kwargs.pop('intended_use', {})
+        self.factors = kwargs.pop('factors', {})
+        self.metrics = kwargs.pop('metrics', {})
+        self.evaluation_data = kwargs.pop('evaluation_data', {})
+        self.training_data = kwargs.pop('training_data', {})
+        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
+        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
+        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
+
+        # Open additional attributes: whatever is left in kwargs becomes an attribute too
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
+    def save_pretrained(self, save_directory):
+        """ Save a model card object to the directory `save_directory`, so that it
+            can be re-loaded using the :func:`~transformers.ModelCard.from_pretrained` class method.
+
+            `save_directory` must be an existing directory (this is checked with an assertion).
+            The model card is written as JSON to a file named `MODEL_CARD_NAME` inside it.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model card can be saved"
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_card_file = os.path.join(save_directory, MODEL_CARD_NAME)
+
+        self.to_json_file(output_model_card_file)
+        logger.info("Model card saved in {}".format(output_model_card_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model's model card.
+
+        If the model card file cannot be retrieved (an ``EnvironmentError`` is raised while resolving
+        or reading it), a warning is logged and an empty model card is used instead.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
+                - a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                card should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+
+                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model card file and override the cached version if it exists.
+
+            resume_download: (`optional`) boolean, default False:
+                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final model card object.
+                - If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+
+        Examples::
+
+            model_card = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
+            model_card = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+            model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json')
+            model_card = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
+        proxies = kwargs.pop('proxies', None)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in ALL_MODELS_MAP:
+            # For simplicity we use the same pretrained url than config but with a different suffix.
+            # `str.replace` returns a new string, so the result has to be assigned back;
+            # otherwise the config file itself would be loaded as the model card.
+            model_card_file = ALL_MODELS_MAP[pretrained_model_name_or_path]
+            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
+        elif os.path.isdir(pretrained_model_name_or_path):
+            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+            model_card_file = pretrained_model_name_or_path
+        else:
+            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
+                                               proxies=proxies, resume_download=resume_download)
+
+            if resolved_model_card_file == model_card_file:
+                logger.info("loading model card file {}".format(model_card_file))
+            else:
+                logger.info("loading model card file {} from cache at {}".format(
+                    model_card_file, resolved_model_card_file))
+
+            # Load model card
+            model_card = cls.from_json_file(resolved_model_card_file)
+
+        except EnvironmentError:
+            if pretrained_model_name_or_path in ALL_MODELS_MAP:
+                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
+                        model_card_file))
+            else:
+                logger.warning("Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a model card file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(ALL_MODELS_MAP.keys()),
+                        model_card_file, MODEL_CARD_NAME))
+
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            model_card = cls()
+
+        # Update model card with kwargs if needed: only keys that are already attributes
+        # of the loaded card are applied; the others are left in kwargs as "unused".
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(model_card, key):
+                setattr(model_card, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model card: %s", str(model_card))
+        if return_unused_kwargs:
+            return model_card, kwargs
+        else:
+            return model_card
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `ModelCard` from a Python dictionary of parameters."""
+        return cls(**json_object)
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `ModelCard` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
+
+    def __eq__(self, other):
+        """Two model cards are equal when their attribute dictionaries are equal."""
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        """The representation of a model card is its JSON serialization."""
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary (a deep copy of its attributes)."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string (2-space indent, sorted keys, trailing newline)."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
--- a/transformers/tests/model_card_test.py
+++ b/transformers/tests/model_card_test.py
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import sys
+import json
+import tempfile
+import shutil
+import unittest
+
+from transformers.model_card import ModelCard
+from .tokenization_tests_commons import TemporaryDirectory
+
+class ModelCardTester(unittest.TestCase):
+    """Checks `ModelCard` construction from a dict and its JSON serialization round trips."""
+
+    def setUp(self):
+        """Build the sample model card sections and create a scratch directory."""
+        model_details = {
+            'Organization': 'testing',
+            'Model date': 'today',
+            'Model version': 'v2.1, Developed by Test Corp in 2019.',
+            'Architecture': 'Convolutional Neural Network.',
+        }
+        evaluation_data = {
+            'Datasets': {
+                'BLEU': 'My-great-dataset-v1',
+                'ROUGE-1': 'My-short-dataset-v2.1',
+            },
+            'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf',
+        }
+        training_data = {
+            'Dataset': 'English Wikipedia dump dated 2018-12-01',
+            'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf',
+        }
+        quantitative_analyses = {
+            'BLEU': 55.1,
+            'ROUGE-1': 76,
+        }
+        self.inputs_dict = {
+            'model_details': model_details,
+            'metrics': 'BLEU and ROUGE-1',
+            'evaluation_data': evaluation_data,
+            'training_data': training_data,
+            'quantitative_analyses': quantitative_analyses,
+        }
+        self.tmpdirname = tempfile.mkdtemp()
+
+    def tearDown(self):
+        """Remove the scratch directory created in `setUp`."""
+        shutil.rmtree(self.tmpdirname)
+
+    def test_model_card_common_properties(self):
+        """Every recommended section is exposed as an attribute, even when it was not provided."""
+        card = ModelCard.from_dict(self.inputs_dict)
+        expected_sections = (
+            'model_details',
+            'intended_use',
+            'factors',
+            'metrics',
+            'evaluation_data',
+            'training_data',
+            'quantitative_analyses',
+            'ethical_considerations',
+            'caveats_and_recommendations',
+        )
+        for section in expected_sections:
+            self.assertTrue(hasattr(card, section))
+
+    def test_model_card_to_json_string(self):
+        """The JSON string contains every input section with its original value."""
+        card = ModelCard.from_dict(self.inputs_dict)
+        decoded = json.loads(card.to_json_string())
+        for section, expected in self.inputs_dict.items():
+            self.assertEqual(decoded[section], expected)
+
+    def test_model_card_to_json_file(self):
+        """A model card written to a JSON file and read back has the same content."""
+        original_card = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            json_path = os.path.join(tmpdirname, u"model_card.json")
+            original_card.to_json_file(json_path)
+            reloaded_card = ModelCard.from_json_file(json_path)
+
+        self.assertEqual(reloaded_card.to_dict(), original_card.to_dict())
+
+# Run this module's test cases when the file is executed directly.
+if __name__ == "__main__":
+    unittest.main()