add new model of MGP-STR (#21418)

* add new model of MGP-STR * fix the check failings * remove torch and numpy from mgp_tokenization * remove unused import from modeling_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str.py * add test_processing_mgp_str * add test_processing_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str and add softmax outs to model * rm test_processing_mgp_str and add softmax outs to model * rewrite the code of mgp-str according to PR suggestions * rewrite the code of mgp-str according to PR suggestions * add new model of MGP-STR * fix the check failings * remove torch and numpy from mgp_tokenization * remove unused import from modeling_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str.py * add test_processing_mgp_str * add test_processing_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str and add softmax outs to model * rewrite the code of mgp-str according to PR suggestions * rewrite the code of mgp-str according to PR suggestions * remove representation_size from MGPSTRConfig * reformat configuration_mgp_str.py * format test_processor_mgp_str.py * add test for tokenizer and complete model/processer test and model file * rm Unnecessary tupple in modeling_mgp_str * reduce hidden_size/layers/label_size in test_model * add integration tests and change MGPSTR to Mgpstr * add test for logit values * reformat test model file --------- Co-authored-by: yue kun <yuekun.wp@alibaba-inc.com>

add new model of MGP-STR (#21418)
* add new model of MGP-STR * fix the check failings * remove torch and numpy from mgp_tokenization * remove unused import from modeling_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str.py * add test_processing_mgp_str * add test_processing_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str and add softmax outs to model * rm test_processing_mgp_str and add softmax outs to model * rewrite the code of mgp-str according to PR suggestions * rewrite the code of mgp-str according to PR suggestions * add new model of MGP-STR * fix the check failings * remove torch and numpy from mgp_tokenization * remove unused import from modeling_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str.py * add test_processing_mgp_str * add test_processing_mgp_str * add test_processing_mgp_str * rm test_processing_mgp_str and add softmax outs to model * rewrite the code of mgp-str according to PR suggestions * rewrite the code of mgp-str according to PR suggestions * remove representation_size from MGPSTRConfig * reformat configuration_mgp_str.py * format test_processor_mgp_str.py * add test for tokenizer and complete model/processer test and model file * rm Unnecessary tupple in modeling_mgp_str * reduce hidden_size/layers/label_size in test_model * add integration tests and change MGPSTR to Mgpstr * add test for logit values * reformat test model file --------- Co-authored-by: yue kun <yuekun.wp@alibaba-inc.com>
102b5ff4 · wangpeng · GitHub · 32e3466d · 102b5ff4 · 102b5ff4
Unverified Commit 102b5ff4 authored Mar 13, 2023 by wangpeng Committed by GitHub Mar 13, 2023
8 changed files
--- a/src/transformers/models/mgp_str/processing_mgp_str.py
+++ b/src/transformers/models/mgp_str/processing_mgp_str.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for MGP-STR."""
+import warnings
+from transformers import AutoTokenizer
+from transformers.utils import is_torch_available
+from transformers.utils.generic import ExplicitEnum
+from ...processing_utils import ProcessorMixin
+if is_torch_available():
+    import torch
+class DecodeType(ExplicitEnum):
+    """Decoding formats recognised by `MgpstrProcessor._decode_helper`: character, BPE and wordpiece."""
+    # The string values are what callers may pass as `format` instead of the enum member.
+    CHARACTER = "char"
+    BPE = "bpe"
+    WORDPIECE = "wp"
+# Every `DecodeType` member, in the order character, BPE, wordpiece.
+SUPPORTED_ANNOTATION_FORMATS = (DecodeType.CHARACTER, DecodeType.BPE, DecodeType.WORDPIECE)
+class MgpstrProcessor(ProcessorMixin):
+    r"""
+    Constructs a MGP-STR processor which wraps an image processor and MGP-STR tokenizers into a single processor.
+    [`MgpstrProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`MgpstrTokenizer`]. See the
+    [`~MgpstrProcessor.__call__`] and [`~MgpstrProcessor.batch_decode`] for more information.
+    Args:
+        image_processor (`ViTImageProcessor`):
+            An instance of `ViTImageProcessor`. The image processor is a required input.
+        tokenizer ([`MgpstrTokenizer`]):
+            The character-level tokenizer, stored as `char_tokenizer`. The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "char_tokenizer"]
+    image_processor_class = "ViTImageProcessor"
+    char_tokenizer_class = "MgpstrTokenizer"
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        """
+        Store the character tokenizer and image processor and load the BPE and wordpiece tokenizers.
+        Args:
+            image_processor: The image processor to wrap. May instead be supplied through the deprecated
+                `feature_extractor` keyword argument.
+            tokenizer: The character tokenizer to wrap.
+        Raises:
+            ValueError: If no image processor (or legacy `feature_extractor`) or no tokenizer is given.
+        """
+        # Bind the legacy argument up front so the fallback below never reads an unassigned local when
+        # neither `image_processor` nor `feature_extractor` was passed.
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+        self.char_tokenizer = tokenizer
+        # The BPE and wordpiece tokenizers are always the stock "gpt2" and "bert-base-uncased" ones.
+        self.bpe_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.wp_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        super().__init__(image_processor, tokenizer)
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to ViTImageProcessor's
+        [`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs`
+        arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please
+        refer to the docstring of the above methods for more information.
+        Returns:
+            The image processor output when only `images` is given, the tokenizer output when only `text` is given,
+            or the image processor output with the tokenizer's `input_ids` stored under `"labels"` when both are
+            given.
+        Raises:
+            ValueError: If both `images` and `text` are `None`.
+        """
+        if images is None and text is None:
+            raise ValueError("You need to specify either an `images` or `text` input to process.")
+        if images is not None:
+            inputs = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+        if text is not None:
+            encodings = self.char_tokenizer(text, return_tensors=return_tensors, **kwargs)
+        if text is None:
+            return inputs
+        elif images is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+    def batch_decode(self, sequences):
+        """
+        Decode the character, BPE and wordpiece predictions and keep, per sample, the most confident one.
+        Args:
+            sequences (sequence of three `torch.Tensor`):
+                The character, BPE and wordpiece prediction logits, in that order.
+        Returns:
+            `Dict[str, any]`: Dictionary of all the outputs of the decoded results.
+                generated_text (`List[str]`): The final results after fusion of char, bpe, and wp. scores
+                (`List[float]`): The final scores after fusion of char, bpe, and wp. char_preds (`List[str]`): The list
+                of character decoded sentences. bpe_preds (`List[str]`): The list of bpe decoded sentences. wp_preds
+                (`List[str]`): The list of wp decoded sentences.
+        """
+        char_preds, bpe_preds, wp_preds = sequences
+        batch_size = char_preds.size(0)
+        char_strs, char_scores = self._decode_helper(char_preds, DecodeType.CHARACTER)
+        bpe_strs, bpe_scores = self._decode_helper(bpe_preds, DecodeType.BPE)
+        wp_strs, wp_scores = self._decode_helper(wp_preds, DecodeType.WORDPIECE)
+        final_strs = []
+        final_scores = []
+        for i in range(batch_size):
+            scores = [char_scores[i], bpe_scores[i], wp_scores[i]]
+            strs = [char_strs[i], bpe_strs[i], wp_strs[i]]
+            # On ties `list.index` returns the first match, so char wins over bpe, and bpe over wp.
+            max_score_index = scores.index(max(scores))
+            final_strs.append(strs[max_score_index])
+            final_scores.append(scores[max_score_index])
+        out = {}
+        out["generated_text"] = final_strs
+        out["scores"] = final_scores
+        out["char_preds"] = char_strs
+        out["bpe_preds"] = bpe_strs
+        out["wp_preds"] = wp_strs
+        return out
+    def _decode_helper(self, pred_logits, format):
+        """
+        Turn the prediction logits of one format into decoded strings and confidence scores.
+        Args:
+            pred_logits (`torch.Tensor`):
+                Model prediction logits; the first dimension is the batch, the second the sequence position and the
+                last the vocabulary.
+            format (`Union[DecodeType, str]`):
+                Type of model prediction. Must be one of ['char', 'bpe', 'wp'].
+        Returns:
+            `tuple`:
+                dec_strs(`List[str]`): The decoded strings of the model prediction, cut before the end-of-sequence
+                marker. conf_scores(`List`): One confidence score per sample, the product of the per-position
+                maximum probabilities up to and including the end-of-sequence token.
+        Raises:
+            ValueError: If `format` is not one of the supported decode types.
+        """
+        # Each format pairs a decoder with the id and the decoded text of its end-of-sequence token.
+        if format == DecodeType.CHARACTER:
+            decoder = self.char_decode
+            eos_token = 1
+            eos_str = "[s]"
+        elif format == DecodeType.BPE:
+            decoder = self.bpe_decode
+            eos_token = 2
+            eos_str = "#"
+        elif format == DecodeType.WORDPIECE:
+            decoder = self.wp_decode
+            eos_token = 102
+            eos_str = "[SEP]"
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+        dec_strs, conf_scores = [], []
+        batch_size = pred_logits.size(0)
+        batch_max_length = pred_logits.size(1)
+        _, preds_index = pred_logits.topk(1, dim=-1, largest=True, sorted=True)
+        # Position 0 is dropped from both the ids and the probabilities below.
+        preds_index = preds_index.view(-1, batch_max_length)[:, 1:]
+        preds_str = decoder(preds_index)
+        preds_max_prob, _ = torch.nn.functional.softmax(pred_logits, dim=2).max(dim=2)
+        preds_max_prob = preds_max_prob[:, 1:]
+        for index in range(batch_size):
+            # NOTE(review): when `eos_str` is absent `find` returns -1, so the slice drops the last character.
+            pred_eos = preds_str[index].find(eos_str)
+            pred = preds_str[index][:pred_eos]
+            pred_index = preds_index[index].cpu().tolist()
+            # With no EOS id the index is -1, the slice below is empty and the score falls back to 0.0.
+            pred_eos_index = pred_index.index(eos_token) if eos_token in pred_index else -1
+            pred_max_prob = preds_max_prob[index][: pred_eos_index + 1]
+            confidence_score = pred_max_prob.cumprod(dim=0)[-1] if pred_max_prob.nelement() != 0 else 0.0
+            dec_strs.append(pred)
+            conf_scores.append(confidence_score)
+        return dec_strs, conf_scores
+    def char_decode(self, sequences):
+        """
+        Convert a list of lists of char token ids into a list of strings by calling char tokenizer.
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+        Returns:
+            `List[str]`: The list of char decoded sentences, with all spaces removed.
+        """
+        decode_strs = [seq.replace(" ", "") for seq in self.char_tokenizer.batch_decode(sequences)]
+        return decode_strs
+    def bpe_decode(self, sequences):
+        """
+        Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer.
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+        Returns:
+            `List[str]`: The list of bpe decoded sentences.
+        """
+        return self.bpe_tokenizer.batch_decode(sequences)
+    def wp_decode(self, sequences):
+        """
+        Convert a list of lists of word piece token ids into a list of strings by calling word piece tokenizer.
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+        Returns:
+            `List[str]`: The list of wp decoded sentences, with all spaces removed.
+        """
+        decode_strs = [seq.replace(" ", "") for seq in self.wp_tokenizer.batch_decode(sequences)]
+        return decode_strs
--- a/src/transformers/models/mgp_str/tokenization_mgp_str.py
+++ b/src/transformers/models/mgp_str/tokenization_mgp_str.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for MGP-STR CHAR."""
+import json
+import os
+from typing import Optional, Tuple
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+logger = logging.get_logger(__name__)
+# File name under which the character vocabulary is stored.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
+# The Hub serves raw file contents from `/resolve/<revision>/`; `/blob/<revision>/` is the HTML viewer page.
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "mgp-str": "https://huggingface.co/alibaba-damo/mgp-str-base/resolve/main/vocab.json",
+    }
+}
+# Exposed on the tokenizer class as `max_model_input_sizes`.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mgp-str": 27}
+class MgpstrTokenizer(PreTrainedTokenizer):
+    """
+    Construct a MGP-STR char tokenizer.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        unk_token (`str`, *optional*, defaults to `"[GO]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"[GO]"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"[s]"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[GO]"`):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs):
+        """Register the special tokens with the base class, then read the JSON vocabulary from `vocab_file`."""
+        special_tokens = {
+            "unk_token": unk_token,
+            "bos_token": bos_token,
+            "eos_token": eos_token,
+            "pad_token": pad_token,
+        }
+        super().__init__(**special_tokens, **kwargs)
+        with open(vocab_file, encoding="utf-8") as reader:
+            self.vocab = json.load(reader)
+        # Reverse lookup table: id -> token.
+        self.decoder = {index: token for token, index in self.vocab.items()}
+    @property
+    def vocab_size(self):
+        """Number of entries in the base vocabulary (added tokens are not counted)."""
+        return len(self.vocab)
+    def get_vocab(self):
+        """Return a new dict holding the base vocabulary overlaid with the added tokens."""
+        merged = dict(self.vocab)
+        merged.update(self.added_tokens_encoder)
+        return merged
+    def _tokenize(self, text):
+        """Split a string into its individual characters."""
+        return [char for piece in text for char in piece]
+    def _convert_token_to_id(self, token):
+        """Look up the id of a token (str), falling back to the id of the unknown token."""
+        fallback_id = self.vocab.get(self.unk_token)
+        return self.vocab.get(token, fallback_id)
+    def _convert_id_to_token(self, index):
+        """Look up the token (str) of an id (integer); returns `None` for an unknown id."""
+        return self.decoder.get(index)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Write the vocabulary as JSON into `save_directory` and return a one-element tuple with the file path.
+        If `save_directory` is not an existing directory, an error is logged and `None` is returned.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            serialized = json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False)
+            writer.write(serialized + "\n")
+        return (vocab_file,)
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4219,6 +4219,30 @@ class MegatronBertPreTrainedModel(metaclass=DummyObject):
        requires_backends(self, ["torch"])
+MGP_STR_PRETRAINED_MODEL_ARCHIVE_LIST = None
+class MgpstrForSceneTextRecognition(metaclass=DummyObject):
+    # Placeholder in the dummy torch-objects module; construction defers to `requires_backends`
+    # (presumably to fail with a clear message when torch is missing — confirm against its definition).
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class MgpstrModel(metaclass=DummyObject):
+    # Placeholder in the dummy torch-objects module; construction defers to `requires_backends`
+    # (presumably to fail with a clear message when torch is missing — confirm against its definition).
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+class MgpstrPreTrainedModel(metaclass=DummyObject):
+    # Placeholder in the dummy torch-objects module; construction defers to `requires_backends`
+    # (presumably to fail with a clear message when torch is missing — confirm against its definition).
+    _backends = ["torch"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
 class MMBTForClassification(metaclass=DummyObject):
    _backends = ["torch"]

--- a/tests/models/mgp_str/__init__.py
+++ b/tests/models/mgp_str/__init__.py
--- a/tests/models/mgp_str/test_modeling_mgp_str.py
+++ b/tests/models/mgp_str/test_modeling_mgp_str.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch MGP-STR model. """
+import inspect
+import unittest
+import requests
+from transformers import MgpstrConfig
+from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
+if is_torch_available():
+    import torch
+    from torch import nn
+    from transformers import MgpstrForSceneTextRecognition
+if is_vision_available():
+    from PIL import Image
+    from transformers import MgpstrProcessor
+class MgpstrModelTester:
+    """Builds a small `MgpstrConfig` and matching random pixel inputs for the model tests."""
+    def __init__(
+        self,
+        parent,
+        is_training=False,
+        batch_size=13,
+        image_size=(32, 128),
+        patch_size=4,
+        num_channels=3,
+        max_token_length=27,
+        num_character_labels=38,
+        num_bpe_labels=99,
+        num_wordpiece_labels=99,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        mlp_ratio=4.0,
+        patch_embeds_hidden_size=257,
+        output_hidden_states=None,
+    ):
+        # `parent` is the running test case; its assert methods are used by the checks below.
+        self.parent = parent
+        self.is_training = is_training
+        # Input geometry.
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        # Output heads.
+        self.max_token_length = max_token_length
+        self.num_character_labels = num_character_labels
+        self.num_bpe_labels = num_bpe_labels
+        self.num_wordpiece_labels = num_wordpiece_labels
+        # Encoder size.
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        # Expected second-to-last dimension of each hidden state in the hidden-states test.
+        self.patch_embeds_hidden_size = patch_embeds_hidden_size
+        self.output_hidden_states = output_hidden_states
+    def prepare_config_and_inputs(self):
+        """Return `(config, pixel_values)` with random pixel values of the configured shape."""
+        input_shape = [self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]]
+        pixel_values = floats_tensor(input_shape)
+        return self.get_config(), pixel_values
+    def get_config(self):
+        """Build an `MgpstrConfig` from the tester's attributes."""
+        config_kwargs = {
+            "image_size": self.image_size,
+            "patch_size": self.patch_size,
+            "num_channels": self.num_channels,
+            "max_token_length": self.max_token_length,
+            "num_character_labels": self.num_character_labels,
+            "num_bpe_labels": self.num_bpe_labels,
+            "num_wordpiece_labels": self.num_wordpiece_labels,
+            "hidden_size": self.hidden_size,
+            "num_hidden_layers": self.num_hidden_layers,
+            "num_attention_heads": self.num_attention_heads,
+            "mlp_ratio": self.mlp_ratio,
+            "output_hidden_states": self.output_hidden_states,
+        }
+        return MgpstrConfig(**config_kwargs)
+    def create_and_check_model(self, config, pixel_values):
+        """Run a forward pass in eval mode and check the shape of the first entry of the first output."""
+        model = MgpstrForSceneTextRecognition(config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            outputs = model(pixel_values)
+        expected_shape = (self.batch_size, self.max_token_length, self.num_character_labels)
+        self.parent.assertEqual(outputs[0][0].shape, expected_shape)
+    def prepare_config_and_inputs_for_common(self):
+        """Return `(config, inputs_dict)` in the form the common model tests expect."""
+        config, pixel_values = self.prepare_config_and_inputs()
+        return config, {"pixel_values": pixel_values}
+@require_torch
+class MgpstrModelTest(ModelTesterMixin, unittest.TestCase):
+    """Common model tests for `MgpstrForSceneTextRecognition`, with overrides for this model."""
+    all_model_classes = (MgpstrForSceneTextRecognition,) if is_torch_available() else ()
+    # Switches read by the common test mixin.
+    fx_compatible = False
+    test_pruning = False
+    test_resize_embeddings = False
+    test_head_masking = False
+    test_attention_outputs = False
+    def setUp(self):
+        self.model_tester = MgpstrModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=MgpstrConfig, has_text_modality=False)
+    def test_config(self):
+        self.config_tester.run_common_tests()
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+    @unittest.skip(reason="MgpstrModel does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+    def test_model_common_attributes(self):
+        """Input embeddings must be a module; output embeddings must be absent or a linear layer."""
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
+            x = model.get_output_embeddings()
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
+    def test_forward_signature(self):
+        """The first argument of `forward` must be `pixel_values`."""
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+            expected_arg_names = ["pixel_values"]
+            self.assertListEqual(arg_names[:1], expected_arg_names)
+    @unittest.skip(reason="MgpstrModel does not support feedforward chunking")
+    def test_feed_forward_chunking(self):
+        pass
+    def test_gradient_checkpointing_backward_compatibility(self):
+        """Setting `config.gradient_checkpointing` must enable checkpointing on models that support it."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            if not model_class.supports_gradient_checkpointing:
+                continue
+            config.gradient_checkpointing = True
+            model = model_class(config)
+            self.assertTrue(model.is_gradient_checkpointing)
+    def test_hidden_states_output(self):
+        """Hidden states must be returned when requested through the inputs or through the config."""
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            hidden_states = outputs.hidden_states
+            expected_num_layers = getattr(
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+            )
+            self.assertEqual(len(hidden_states), expected_num_layers)
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [self.model_tester.patch_embeds_hidden_size, self.model_tester.hidden_size],
+            )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+    # override to restrict the check to the parameters of Linear, Conv2d and LayerNorm modules
+    def test_initialization(self):
+        """With a zero-init config, Linear/Conv2d/LayerNorm parameters must have a mean of 0.0 or 1.0."""
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            # Filter on the owning module: a parameter itself is never an instance of a layer class,
+            # so testing `isinstance(param, nn.Linear)` would skip every parameter and assert nothing.
+            for module_name, module in model.named_modules():
+                if not isinstance(module, (nn.Linear, nn.Conv2d, nn.LayerNorm)):
+                    continue
+                # recurse=False: only the parameters held directly by this module.
+                for param_name, param in module.named_parameters(recurse=False):
+                    if param.requires_grad:
+                        name = f"{module_name}.{param_name}"
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+    @unittest.skip(reason="Retain_grad is tested in individual model tests")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+# We will verify our results on an image from the IIIT-5k dataset
+def prepare_img():
+    """
+    Download the sample test image and return it converted to RGB.
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If the server does not respond within the timeout.
+    """
+    url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png"
+    # Without a timeout a stalled connection would hang the test run indefinitely.
+    response = requests.get(url, stream=True, timeout=30)
+    # Fail on the HTTP error itself rather than on an image-decoding error of the error page.
+    response.raise_for_status()
+    return Image.open(response.raw).convert("RGB")
+@require_vision
+@require_torch
+class MgpstrModelIntegrationTest(unittest.TestCase):
+    """Runs the pretrained checkpoint on one sample image and checks the decoded text and logits."""
+    @slow
+    def test_inference(self):
+        checkpoint = "alibaba-damo/mgp-str-base"
+        model = MgpstrForSceneTextRecognition.from_pretrained(checkpoint).to(torch_device)
+        processor = MgpstrProcessor.from_pretrained(checkpoint)
+        sample = prepare_img()
+        pixel_values = processor(images=sample, return_tensors="pt").pixel_values.to(torch_device)
+        # forward pass
+        with torch.no_grad():
+            outputs = model(pixel_values)
+        # verify the shape of the first logits entry
+        first_logits = outputs.logits[0]
+        self.assertEqual(first_logits.shape, torch.Size((1, 27, 38)))
+        # verify the decoded text
+        decoded = processor.batch_decode(outputs.logits)
+        self.assertEqual(decoded["generated_text"][0], "ticket")
+        # verify a slice of the logit values
+        expected_slice = torch.tensor(
+            [[[-39.7358, -44.8562, -36.6253], [-62.3605, -64.5908, -59.0069], [-74.6127, -68.9724, -71.7150]]],
+            device=torch_device,
+        )
+        self.assertTrue(torch.allclose(first_logits[:, 1:4, 1:4], expected_slice, atol=1e-4))
--- a/tests/models/mgp_str/test_processor_mgp_str.py
+++ b/tests/models/mgp_str/test_processor_mgp_str.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the MgpstrProcessor. """
+import json
+import os
+import shutil
+import tempfile
+import unittest
+import numpy as np
+import pytest
+from transformers import MgpstrTokenizer
+from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available
+if is_torch_available():
+    import torch
+if is_vision_available():
+    from PIL import Image
+    from transformers import MgpstrProcessor, ViTImageProcessor
+@require_torch
+@require_vision
+class MgpstrProcessorTest(unittest.TestCase):
+    image_processing_class = ViTImageProcessor if is_vision_available() else None
+    @property
+    def image_processor_dict(self):
+        # NOTE(review): `image_processor_tester` is not assigned in the visible `setUp`, so reading this
+        # property would raise AttributeError unless it is set elsewhere — confirm or remove.
+        return self.image_processor_tester.prepare_image_processor_dict()
+    def setUp(self):
+        """Create a temp directory holding a character vocabulary file and an image processor config."""
+        self.image_size = (3, 32, 128)
+        self.tmpdirname = tempfile.mkdtemp()
+        # fmt: off
+        vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
+        # fmt: on
+        # Each token's id is its position in the list above.
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        image_processor_map = {
+            "do_normalize": False,
+            "do_resize": True,
+            "feature_extractor_type": "ViTFeatureExtractor",
+            "resample": 3,
+            "size": {"height": 32, "width": 128},
+        }
+        # Written under the file name given by IMAGE_PROCESSOR_NAME, in the same directory as the vocabulary.
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
+            json.dump(image_processor_map, fp)
+    def get_tokenizer(self, **kwargs):
+        """Load an `MgpstrTokenizer` from the temp directory, forwarding `kwargs`."""
+        return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    def get_image_processor(self, **kwargs):
+        """Load a `ViTImageProcessor` from the temp directory, forwarding `kwargs`."""
+        return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
+    def tearDown(self):
+        """Delete the temp directory created in `setUp`."""
+        shutil.rmtree(self.tmpdirname)
+    def prepare_image_inputs(self):
+        """Prepare a single random PIL image, 400 pixels wide and 30 pixels high with 3 channels."""
+        image_input = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)
+        # Move the channel axis last, the layout `Image.fromarray` is given here.
+        image_input = Image.fromarray(np.moveaxis(image_input, 0, -1))
+        return image_input
+    def test_save_load_pretrained_default(self):
+        """A processor saved to disk and reloaded must keep its tokenizer vocabulary and image processor config."""
+        tokenizer = self.get_tokenizer()
+        image_processor = self.get_image_processor()
+        processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        # Saved into, then reloaded from, the same temp directory the components came from.
+        processor.save_pretrained(self.tmpdirname)
+        processor = MgpstrProcessor.from_pretrained(self.tmpdirname, use_fast=False)
+        self.assertEqual(processor.char_tokenizer.get_vocab(), tokenizer.get_vocab())
+        self.assertIsInstance(processor.char_tokenizer, MgpstrTokenizer)
+        self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
+        self.assertIsInstance(processor.image_processor, ViTImageProcessor)
+    def test_save_load_pretrained_additional_features(self):
+        """Compare a processor reloaded with extra kwargs against individually loaded components.
+        The tokenizer and image processor used as references are loaded from the same directory with the
+        same kwargs that are passed to `MgpstrProcessor.from_pretrained`.
+        """
+        saved = MgpstrProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor())
+        saved.save_pretrained(self.tmpdirname)
+        tokenizer_kwargs = {"bos_token": "(BOS)", "eos_token": "(EOS)"}
+        image_processor_kwargs = {"do_normalize": False, "padding_value": 1.0}
+        expected_tokenizer = self.get_tokenizer(**tokenizer_kwargs)
+        expected_image_processor = self.get_image_processor(**image_processor_kwargs)
+        reloaded = MgpstrProcessor.from_pretrained(self.tmpdirname, **tokenizer_kwargs, **image_processor_kwargs)
+        reloaded_tokenizer = reloaded.char_tokenizer
+        self.assertEqual(reloaded_tokenizer.get_vocab(), expected_tokenizer.get_vocab())
+        self.assertIsInstance(reloaded_tokenizer, MgpstrTokenizer)
+        reloaded_image_processor = reloaded.image_processor
+        self.assertEqual(reloaded_image_processor.to_json_string(), expected_image_processor.to_json_string())
+        self.assertIsInstance(reloaded_image_processor, ViTImageProcessor)
+    def test_image_processor(self):
+        """Check that the processor and the bare image processor agree on the same image (compared by sum)."""
+        bare_image_processor = self.get_image_processor()
+        processor = MgpstrProcessor(tokenizer=self.get_tokenizer(), image_processor=bare_image_processor)
+        image = self.prepare_image_inputs()
+        expected = bare_image_processor(image, return_tensors="np")
+        actual = processor(images=image, return_tensors="np")
+        for name in expected.keys():
+            # Sums are compared with an absolute tolerance rather than comparing arrays element-wise.
+            self.assertAlmostEqual(expected[name].sum(), actual[name].sum(), delta=1e-2)
+    def test_tokenizer(self):
+        """Check that encoding text through the processor matches encoding it with the tokenizer directly."""
+        image_processor = self.get_image_processor()
+        char_tokenizer = self.get_tokenizer()
+        processor = MgpstrProcessor(tokenizer=char_tokenizer, image_processor=image_processor)
+        text = "test"
+        from_processor = processor(text=text)
+        from_tokenizer = char_tokenizer(text)
+        for name in from_tokenizer.keys():
+            self.assertListEqual(from_tokenizer[name], from_processor[name])
+    def test_processor(self):
+        """Check the output keys for a text + image call, and that a call without inputs raises `ValueError`."""
+        image_processor = self.get_image_processor()
+        char_tokenizer = self.get_tokenizer()
+        processor = MgpstrProcessor(tokenizer=char_tokenizer, image_processor=image_processor)
+        text = "test"
+        image = self.prepare_image_inputs()
+        outputs = processor(text=text, images=image)
+        expected_keys = ["pixel_values", "labels"]
+        self.assertListEqual(list(outputs.keys()), expected_keys)
+        # Calling the processor with neither text nor images must be rejected.
+        with pytest.raises(ValueError):
+            processor()
+    def test_tokenizer_decode(self):
+        """Check that `char_decode` equals the tokenizer's `batch_decode` output with spaces removed."""
+        image_processor = self.get_image_processor()
+        char_tokenizer = self.get_tokenizer()
+        processor = MgpstrProcessor(tokenizer=char_tokenizer, image_processor=image_processor)
+        id_batches = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9], [3, 4, 3, 1, 1, 8, 9]]
+        from_processor = processor.char_decode(id_batches)
+        # The reference strings are the tokenizer's decodings stripped of every space character.
+        expected = [text.replace(" ", "") for text in char_tokenizer.batch_decode(id_batches)]
+        self.assertListEqual(expected, from_processor)
+    def test_model_input_names(self):
+        """Check that an image-only call returns exactly the keys listed in `processor.model_input_names`."""
+        image_processor = self.get_image_processor()
+        char_tokenizer = self.get_tokenizer()
+        processor = MgpstrProcessor(tokenizer=char_tokenizer, image_processor=image_processor)
+        image = self.prepare_image_inputs()
+        # `text=None` is passed explicitly, so only the image is supplied.
+        outputs = processor(text=None, images=image)
+        returned_keys = list(outputs.keys())
+        self.assertListEqual(returned_keys, processor.model_input_names)
+    def test_processor_batch_decode(self):
+        """Check the keys of the dict returned by `processor.batch_decode` for random char/bpe/wp inputs."""
+        image_processor = self.get_image_processor()
+        char_tokenizer = self.get_tokenizer()
+        processor = MgpstrProcessor(tokenizer=char_tokenizer, image_processor=image_processor)
+        # Random tensors of shape (1, 27, N), one per head; the last dimension is presumably each head's
+        # vocabulary size (38 matches the character vocab written in setUp) -- TODO confirm for bpe/wp.
+        head_inputs = [
+            torch.randn(1, 27, 38),
+            torch.randn(1, 27, 50257),
+            torch.randn(1, 27, 30522),
+        ]
+        decoded = processor.batch_decode(head_inputs)
+        expected_keys = ["generated_text", "scores", "char_preds", "bpe_preds", "wp_preds"]
+        self.assertListEqual(list(decoded.keys()), expected_keys)
--- a/tests/models/mgp_str/test_tokenization_mgp_str.py
+++ b/tests/models/mgp_str/test_tokenization_mgp_str.py
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import unittest
+from transformers import MgpstrTokenizer
+from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
+from transformers.testing_utils import require_tokenizers
+from ...test_tokenization_common import TokenizerTesterMixin
+@require_tokenizers
+class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    """Tests for `MgpstrTokenizer`, combining `TokenizerTesterMixin` with a few overridden or skipped tests."""
+    tokenizer_class = MgpstrTokenizer
+    test_rust_tokenizer = False
+    from_pretrained_kwargs = {}
+    test_seq2seq = False
+    def setUp(self):
+        """Write a 38-entry character vocabulary file into `self.tmpdirname`."""
+        super().setUp()
+        # fmt: off
+        vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
+        # fmt: on
+        # Each token is mapped to its position in the list.
+        vocab_tokens = {token: index for index, token in enumerate(vocab)}
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+    def get_tokenizer(self, **kwargs):
+        """Load an `MgpstrTokenizer` from `self.tmpdirname`, forwarding `kwargs` to `from_pretrained`."""
+        return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    def get_input_output_texts(self, tokenizer):
+        """Return the `(input_text, output_text)` pair used by the tests; `tokenizer` is not used."""
+        input_text = "tester"
+        output_text = "tester"
+        return input_text, output_text
+    @unittest.skip("MGP-STR always lower cases letters.")
+    def test_added_tokens_do_lower_case(self):
+        pass
+    def test_add_special_tokens(self):
+        """Check that a token added as `cls_token` encodes to one id and is absent from the decoded text
+        when `skip_special_tokens=True`.
+        """
+        special_token = "[SPECIAL_TOKEN]"
+        for tokenizer in self.get_tokenizers(do_lower_case=False):
+            with self.subTest(tokenizer.__class__.__name__):
+                tokenizer.add_special_tokens({"cls_token": special_token})
+                encoded_special_token = tokenizer.encode([special_token], add_special_tokens=False)
+                self.assertEqual(len(encoded_special_token), 1)
+                decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
+                # assertNotIn reports both operands on failure, unlike assertTrue on a boolean.
+                self.assertNotIn(special_token, decoded)
+    def test_internal_consistency(self):
+        """Check that `tokenize` + `convert_tokens_to_ids` agrees with `encode`, and that decoding the ids
+        gives back the expected text once spaces are removed.
+        """
+        for tokenizer in self.get_tokenizers():
+            with self.subTest(tokenizer.__class__.__name__):
+                input_text, output_text = self.get_input_output_texts(tokenizer)
+                tokens = tokenizer.tokenize(input_text)
+                ids = tokenizer.convert_tokens_to_ids(tokens)
+                ids_from_encode = tokenizer.encode(input_text, add_special_tokens=False)
+                self.assertListEqual(ids, ids_from_encode)
+                round_trip_tokens = tokenizer.convert_ids_to_tokens(ids)
+                self.assertNotEqual(len(round_trip_tokens), 0)
+                decoded_text = tokenizer.decode(ids)
+                self.assertIsInstance(decoded_text, str)
+                # Spaces are removed from the decoded text before comparing it with the expected output.
+                self.assertEqual(decoded_text.replace(" ", ""), output_text)
+    @unittest.skip("MGP-STR tokenizer only handles one sequence.")
+    def test_maximum_encoding_length_pair_input(self):
+        pass
+    @unittest.skip("inputs cannot be pretokenized in MgpstrTokenizer")
+    def test_pretokenized_inputs(self):
+        pass
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -95,6 +95,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
    "M2M100Encoder",  # Building part of bigger (tested) model.
    "M2M100Decoder",  # Building part of bigger (tested) model.
    "MCTCTEncoder",  # Building part of bigger (tested) model.
+    "MgpstrModel",  # Building part of bigger (tested) model.
    "Speech2TextEncoder",  # Building part of bigger (tested) model.
    "Speech2TextDecoder",  # Building part of bigger (tested) model.
    "LEDEncoder",  # Building part of bigger (tested) model.
@@ -269,6 +270,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
    "LukeForEntityClassification",
    "LukeForEntityPairClassification",
    "LukeForEntitySpanClassification",
+    "MgpstrModel",
    "OpenAIGPTDoubleHeadsModel",
    "OwlViTTextModel",
    "OwlViTVisionModel",