Unverified Commit 5340d1f2 authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into resumable_http

parents 0e4cc050 10bd1ddb
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from utils_summarization import (
compute_token_type_ids,
fit_to_block_size,
build_mask,
build_lm_labels,
process_story,
)
class SummarizationDataProcessingTest(unittest.TestCase):
def setUp(self):
self.block_size = 10
def test_fit_to_block_sequence_too_small(self):
""" Pad the sequence with 0 if the sequence is smaller than the block size."""
sequence = [1, 2, 3, 4]
expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
self.assertEqual(
fit_to_block_size(sequence, self.block_size, 0), expected_output
)
def test_fit_to_block_sequence_fit_exactly(self):
""" Do nothing if the sequence is the right size. """
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
self.assertEqual(
fit_to_block_size(sequence, self.block_size, 0), expected_output
)
def test_fit_to_block_sequence_too_big(self):
""" Truncate the sequence if it is too long. """
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
self.assertEqual(
fit_to_block_size(sequence, self.block_size, 0), expected_output
)
def test_process_story_no_highlights(self):
""" Processing a story with no highlights returns an empty list for the summary.
"""
raw_story = """It was the year of Our Lord one thousand seven hundred and
seventy-five.\n\nSpiritual revelations were conceded to England at that
favoured period, as at this."""
_, summary_lines = process_story(raw_story)
self.assertEqual(summary_lines, [])
def test_process_empty_story(self):
""" An empty story returns an empty collection of lines.
"""
raw_story = ""
story_lines, summary_lines = process_story(raw_story)
self.assertEqual(story_lines, [])
self.assertEqual(summary_lines, [])
def test_process_story_with_missing_period(self):
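        """ Sentences that are missing a final period get one added during processing. """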
raw_story = (
"It was the year of Our Lord one thousand seven hundred and "
"seventy-five\n\nSpiritual revelations were conceded to England "
"at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
)
story_lines, summary_lines = process_story(raw_story)
expected_story_lines = [
"It was the year of Our Lord one thousand seven hundred and seventy-five.",
"Spiritual revelations were conceded to England at that favoured period, as at this.",
]
self.assertEqual(expected_story_lines, story_lines)
expected_summary_lines = ["It was the best of times."]
self.assertEqual(expected_summary_lines, summary_lines)
def test_build_lm_labels_no_padding(self):
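        """ The labels are identical to the input when there is no padding. """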
sequence = torch.tensor([1, 2, 3, 4])
expected = sequence
np.testing.assert_array_equal(
build_lm_labels(sequence, 0).numpy(), expected.numpy()
)
def test_build_lm_labels(self):
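        """ Padding tokens are replaced by -1 in the labels so the LM loss ignores them. """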
sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
np.testing.assert_array_equal(
build_lm_labels(sequence, 0).numpy(), expected.numpy()
)
def test_build_mask_no_padding(self):
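        """ The mask is all ones when the sequence contains no padding. """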
sequence = torch.tensor([1, 2, 3, 4])
expected = torch.tensor([1, 1, 1, 1])
np.testing.assert_array_equal(build_mask(sequence, 0).numpy(), expected.numpy())
def test_build_mask(self):
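        """ Padding tokens are masked out with 0. """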
sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
np.testing.assert_array_equal(
build_mask(sequence, 23).numpy(), expected.numpy()
)
def test_build_mask_with_padding_equal_to_one(self):
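        """ The mask is still correct when the padding token id is 1. """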
sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
np.testing.assert_array_equal(build_mask(sequence, 1).numpy(), expected.numpy())
def test_compute_token_type_ids(self):
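        """ Token type ids alternate between 0 and 1, switching at each separator token. """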
separator = 101
batch = torch.tensor(
[[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
)
expected = torch.tensor(
[[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
)
result = compute_token_type_ids(batch, separator)
np.testing.assert_array_equal(result, expected)
if __name__ == "__main__":
unittest.main()
@@ -38,7 +38,7 @@ from setuptools import find_packages, setup
 setup(
     name="transformers",
-    version="2.1.1",
+    version="2.2.0",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
# How to add a new example script in 🤗Transformers
This folder provides a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library.
Currently, only PyTorch examples are provided. They are adaptations of the library's SQuAD examples, and implement single-GPU and distributed training with gradient accumulation and mixed precision (using NVIDIA's apex library) to cover a reasonable range of use cases.
# How to add a new model in 🤗Transformers
This folder describes the process to add a new model in 🤗Transformers and provides templates for the required files.
The library is designed to incorporate a variety of models and code bases. As such, the process of adding a new model usually mostly consists of copy-pasting the relevant original code into the various sections of the templates included in the present repository.
One important point, though, is that the library has the following goals that impact the way models are incorporated:
- one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often has to be slightly adapted to allow for running in the python interpreter.
- the package is also designed to be self-consistent, with a small and reliable set of package dependencies. As a consequence, additional dependencies are usually not allowed when adding a model, but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer-specific needs include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies before adding a new one.
For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
# Typical workflow for including a model
Here is an overview of the general workflow:
- [ ] add model/configuration/tokenization classes
- [ ] add conversion scripts
- [ ] add tests
- [ ] finalize
Let's detail what should be done at each step.
## Adding model/configuration/tokenization classes
Here is the workflow for adding model/configuration/tokenization classes:
- [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
- [ ] edit the files to replace `XXX` (with various casing) with your model name
- [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file (a minimal sketch follows this list)
- [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
- [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
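As a point of reference, a configuration class is typically just a `PretrainedConfig` subclass that stores the model's hyper-parameters. The sketch below is illustrative only: the `Xxx` naming mirrors the template's `xxx` placeholder, and the parameters shown are assumptions, not a required set.

```python
# Minimal illustrative configuration class. `XxxConfig` and its parameters
# are placeholders; the real file is copied from `configuration_xxx.py`
# in the present folder and renamed.
from transformers import PretrainedConfig


class XxxConfig(PretrainedConfig):
    def __init__(self, vocab_size=30522, hidden_size=768, num_hidden_layers=12, **kwargs):
        super(XxxConfig, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
```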
## Adding conversion scripts
Here is the workflow for the conversion scripts:
- [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
- [ ] edit this script to convert your original checkpoint weights to the current PyTorch ones (a hedged sketch follows this list).
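As an illustration only, most conversion scripts follow the pattern sketched below: instantiate the model from a configuration, copy the original weights into it, and save a PyTorch state dict. `XxxConfig` and `XxxModel` are hypothetical placeholder names, and the weight-copying step depends entirely on the original checkpoint format.

```python
# Hedged sketch of a conversion script; `XxxConfig` and `XxxModel` are
# placeholder names for your model's classes.
import torch


def convert_checkpoint_to_pytorch(checkpoint_path, config_file, pytorch_dump_path):
    config = XxxConfig.from_json_file(config_file)  # from_json_file comes from PretrainedConfig
    model = XxxModel(config)
    # ... copy the weights from `checkpoint_path` into `model`'s parameters here;
    # the mapping depends on how the original checkpoint names its tensors ...
    torch.save(model.state_dict(), pytorch_dump_path)
```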
## Adding tests
Here is the workflow for adding the tests:
- [ ] copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main folder and rename them, replacing `xxx` with your model name,
- [ ] edit the tests files to replace `XXX` (with various casing) with your model name
- [ ] edit the tests code as needed
## Final steps
You can then finish the addition step by adding imports for your classes in the common files:
- [ ] add imports for all the relevant classes in `__init__.py` (see the hypothetical sketch after this list)
- [ ] add your configuration in `configuration_auto.py`
- [ ] add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`
- [ ] add your tokenizer in `tokenization_auto.py`
- [ ] add your models and tokenizer to `pipeline.py`
- [ ] add a link to your conversion script in the main conversion utility (currently in `__main__`, soon to be moved to the `commands` subfolder)
- [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
- [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
- [ ] upload the pretrained weights, configurations and vocabulary files.
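For instance, the imports added to `__init__.py` typically look like the lines below. The `Xxx` class names are hypothetical placeholders, following the relative-import style used in that file; replace them with your model's actual classes.

```python
# Hypothetical import lines for a model named "Xxx", added to
# transformers/__init__.py; the names are placeholders.
from .configuration_xxx import XxxConfig, XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
from .modeling_xxx import XxxModel, XxxForSequenceClassification
from .tokenization_xxx import XxxTokenizer
```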
@@ -29,6 +29,7 @@ logger = logging.getLogger(__name__)
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                       "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
                                       "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+                                      "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json",
                                       "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
 class GPT2Config(PretrainedConfig):