Unverified commit f3d2f7a6 authored by NielsRogge, committed by GitHub

Add MarkupLM (#19198)



* First draft

* Make basic test work

* Fix most tokenizer tests

* More improvements

* Make more tests pass

* Fix more tests

* Fix some code quality

* Improve truncation

* Implement feature extractor

* Improve feature extractor and add tests

* Improve feature extractor tests

* Fix pair_input test partly

* Add fast tokenizer

* Improve implementation

* Fix rebase

* Fix rebase

* Fix most of the tokenizer tests.

* Propose solution for fast tokenizer

* Add integration test for fast tokenizer, warning for decode, fix template in slow tokenizer

* Modify MarkupLMConverter

* More modifications to converter and fast tokenizer

* Fix style, copies

* Make fixup

* Update tokenization_markuplm.py

* Update test_tokenization_markuplm.py

* Update markuplm related

* Improve processor, add integration test

* Add processor test file

* Improve processor

* Improve processor tests

* Fix more processor tests

* Fix processor tests

* Update docstrings

* Add Copied from statements

* Add more Copied from statements

* Add code examples

* Improve code examples

* Add model to doc tests

* Adding dependency check

* Add dummy file

* Add requires_backends

* Add model to toctree

* Fix more things, disable dependency check for now

* Apply more suggestions

* Add soft dependency

* Add annotators to tests

* Fix style

* Remove from_slow=True

* Remove print statements

* Add sanity check

* Fix processor test

* Fix processor tests, add more docs

* Add doc tests for mdx file

* Add more tips

* Apply suggestions
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
Co-authored-by: lockon-n <45759388+lockon-n@users.noreply.github.com>
Co-authored-by: SaulLu <lucilesaul.com@gmail.com>
Co-authored-by: lockon-n <dd098309@126.com>
parent 49d62b01
@@ -46,6 +46,7 @@ from .utils import (
     is_accelerate_available,
     is_apex_available,
     is_bitsandbytes_available,
+    is_bs4_available,
     is_detectron2_available,
     is_faiss_available,
     is_flax_available,
@@ -239,6 +240,13 @@ def custom_tokenizers(test_case):
     return unittest.skipUnless(_run_custom_tokenizers, "test of custom tokenizers")(test_case)


+def require_bs4(test_case):
+    """
+    Decorator marking a test that requires BeautifulSoup4. These tests are skipped when BeautifulSoup4 isn't installed.
+    """
+    return unittest.skipUnless(is_bs4_available(), "test requires BeautifulSoup4")(test_case)
+
+
 def require_git_lfs(test_case):
     """
     Decorator marking a test that requires git-lfs.
...
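The new `require_bs4` decorator follows the same pattern as the existing `require_*` test helpers: applied to a test function or a whole test class, it skips the test whenever `is_bs4_available()` is False. A small sketch of how it might be used (the test class and its body here are hypothetical, not part of this commit):

```python
import unittest

from transformers.testing_utils import require_bs4


@require_bs4
class HypotheticalHtmlTest(unittest.TestCase):
    # Skipped entirely with "test requires BeautifulSoup4" when bs4 is not installed.
    def test_soup_parses_paragraph(self):
        from bs4 import BeautifulSoup  # safe: this only runs when bs4 is available

        soup = BeautifulSoup("<html><body><p>hello</p></body></html>", "html.parser")
        self.assertEqual(soup.p.get_text(), "hello")
```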
@@ -89,6 +89,7 @@ from .import_utils import (
     is_accelerate_available,
     is_apex_available,
     is_bitsandbytes_available,
+    is_bs4_available,
     is_coloredlogs_available,
     is_datasets_available,
     is_detectron2_available,
...
@@ -3020,6 +3020,44 @@ class MarianMTModel(metaclass=DummyObject):
         requires_backends(self, ["torch"])


+MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class MarkupLMForQuestionAnswering(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MarkupLMForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MarkupLMForTokenClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MarkupLMModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MarkupLMPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 MASKFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
...
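These dummy classes exist so that `from transformers import MarkupLMModel` still succeeds in an environment without PyTorch, and only fails, with an actionable message, when the class is actually used. A rough, self-contained sketch of the pattern (simplified; the real `DummyObject` and `requires_backends` in `transformers` differ in details):

```python
# Self-contained sketch of the dummy-object pattern; not the actual transformers code.
import importlib.util


def is_torch_available() -> bool:
    # Stand-in for the real availability check.
    return importlib.util.find_spec("torch") is not None


TORCH_IMPORT_ERROR = "{0} requires the PyTorch library but it was not found in your environment."

CHECKS = {"torch": (is_torch_available, TORCH_IMPORT_ERROR)}


def requires_backends(obj, backends):
    # Raise one readable ImportError naming the object and every missing backend.
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
    failed = []
    for backend in backends:
        check, message = CHECKS[backend]
        if not check():
            failed.append(message.format(name))
    if failed:
        raise ImportError("".join(failed))


class DummyObject(type):
    # Metaclass: attribute access on the dummy class (e.g. MarkupLMModel.from_pretrained)
    # reports the missing backend instead of doing real work.
    def __getattr__(cls, key):
        if key.startswith("_"):
            raise AttributeError(key)
        requires_backends(cls, cls._backends)


class MarkupLMModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])


# Without torch installed, both MarkupLMModel() and MarkupLMModel.from_pretrained("...")
# raise the ImportError built above instead of an opaque failure.
```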
@@ -234,6 +234,13 @@ class LxmertTokenizerFast(metaclass=DummyObject):
         requires_backends(self, ["tokenizers"])


+class MarkupLMTokenizerFast(metaclass=DummyObject):
+    _backends = ["tokenizers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tokenizers"])
+
+
 class MBartTokenizerFast(metaclass=DummyObject):
     _backends = ["tokenizers"]
...
@@ -386,6 +386,10 @@ def is_torch_fx_available():
     return _torch_fx_available


+def is_bs4_available():
+    return importlib.util.find_spec("bs4") is not None
+
+
 def is_torch_onnx_dict_inputs_support_available():
     return _torch_onnx_dict_inputs_support_available
@@ -748,6 +752,12 @@ If you really do want to use TensorFlow, please follow the instructions on the
 installation page https://www.tensorflow.org/install that match your environment.
 """

+# docstyle-ignore
+BS4_IMPORT_ERROR = """
+{0} requires the Beautiful Soup library but it was not found in your environment. You can install it with pip:
+`pip install beautifulsoup4`
+"""
+
 # docstyle-ignore
 SKLEARN_IMPORT_ERROR = """
@@ -889,6 +899,7 @@ CCL_IMPORT_ERROR = """
 BACKENDS_MAPPING = OrderedDict(
     [
+        ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
         ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
         ("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)),
         ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
...
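With `bs4` registered in `BACKENDS_MAPPING`, any code that calls `requires_backends(obj, ["bs4"])` looks up the `(is_bs4_available, BS4_IMPORT_ERROR)` pair and fails with the `pip install beautifulsoup4` hint when Beautiful Soup is missing. A quick way to exercise the new soft dependency (the `NeedsSoup` class is purely illustrative, not part of the library):

```python
from transformers.utils import is_bs4_available, requires_backends


class NeedsSoup:
    # Illustrative only: any object can declare the bs4 soft dependency this way.
    def __init__(self):
        # Raises ImportError with BS4_IMPORT_ERROR if bs4 is absent; a no-op otherwise.
        requires_backends(self, ["bs4"])


print("bs4 available:", is_bs4_available())
NeedsSoup()
```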
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers.testing_utils import require_bs4
from transformers.utils import is_bs4_available
from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin
if is_bs4_available():
from transformers import MarkupLMFeatureExtractor
class MarkupLMFeatureExtractionTester(unittest.TestCase):
def __init__(self, parent):
self.parent = parent
def prepare_feat_extract_dict(self):
return {}
def get_html_strings():
html_string_1 = """<HTML>
<HEAD>
<TITLE>sample document</TITLE>
</HEAD>
<BODY BGCOLOR="FFFFFF">
<HR>
<a href="http://google.com">Goog</a>
<H1>This is one header</H1>
<H2>This is a another Header</H2>
<P>Travel from
<P>
<B>SFO to JFK</B>
<BR>
<B><I>on May 2, 2015 at 2:00 pm. For details go to confirm.com </I></B>
<HR>
<div style="color:#0000FF">
<h3>Traveler <b> name </b> is
<p> John Doe </p>
</div>"""
html_string_2 = """
<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>
"""
return [html_string_1, html_string_2]
@require_bs4
class MarkupLMFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase):
feature_extraction_class = MarkupLMFeatureExtractor if is_bs4_available() else None
def setUp(self):
self.feature_extract_tester = MarkupLMFeatureExtractionTester(self)
@property
def feat_extract_dict(self):
return self.feature_extract_tester.prepare_feat_extract_dict()
def test_call(self):
# Initialize feature_extractor
feature_extractor = self.feature_extraction_class()
# Test not batched input
html_string = get_html_strings()[0]
encoding = feature_extractor(html_string)
# fmt: off
expected_nodes = [['sample document', 'Goog', 'This is one header', 'This is a another Header', 'Travel from', 'SFO to JFK', 'on May 2, 2015 at 2:00 pm. For details go to confirm.com', 'Traveler', 'name', 'is', 'John Doe']]
expected_xpaths = [['/html/head/title', '/html/body/a', '/html/body/h1', '/html/body/h2', '/html/body/p', '/html/body/p/p/b[1]', '/html/body/p/p/b[2]/i', '/html/body/p/p/div/h3', '/html/body/p/p/div/h3/b', '/html/body/p/p/div/h3', '/html/body/p/p/div/h3/p']]
# fmt: on
self.assertEqual(encoding.nodes, expected_nodes)
self.assertEqual(encoding.xpaths, expected_xpaths)
# Test batched
html_strings = get_html_strings()
encoding = feature_extractor(html_strings)
# fmt: off
expected_nodes = expected_nodes + [['My First Heading', 'My first paragraph.']]
expected_xpaths = expected_xpaths + [['/html/body/h1', '/html/body/p']]
self.assertEqual(len(encoding.nodes), 2)
self.assertEqual(len(encoding.xpaths), 2)
self.assertEqual(encoding.nodes, expected_nodes)
        self.assertEqual(encoding.xpaths, expected_xpaths)
        # fmt: on
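The test above pins down the feature extractor's contract: it accepts a single HTML string or a list of them and returns an encoding whose `nodes` and `xpaths` are parallel per-document lists. A minimal usage sketch along the same lines (assuming beautifulsoup4 is installed):

```python
from transformers import MarkupLMFeatureExtractor

feature_extractor = MarkupLMFeatureExtractor()

html = "<html><body><h1>My First Heading</h1><p>My first paragraph.</p></body></html>"
encoding = feature_extractor(html)

print(encoding.nodes)   # [['My First Heading', 'My first paragraph.']]
print(encoding.xpaths)  # [['/html/body/h1', '/html/body/p']]
```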
@@ -3,6 +3,7 @@ docs/source/es/quicktour.mdx
 docs/source/en/pipeline_tutorial.mdx
 docs/source/en/autoclass_tutorial.mdx
 docs/source/en/task_summary.mdx
+docs/source/en/model_doc/markuplm.mdx
 docs/source/en/model_doc/speech_to_text.mdx
 docs/source/en/model_doc/t5.mdx
 docs/source/en/model_doc/t5v1.1.mdx
@@ -51,6 +52,7 @@ src/transformers/models/longformer/modeling_longformer.py
 src/transformers/models/longformer/modeling_tf_longformer.py
 src/transformers/models/longt5/modeling_longt5.py
 src/transformers/models/marian/modeling_marian.py
+src/transformers/models/markuplm/modeling_markuplm.py
 src/transformers/models/mbart/modeling_mbart.py
 src/transformers/models/mobilebert/modeling_mobilebert.py
 src/transformers/models/mobilebert/modeling_tf_mobilebert.py
...